diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 42497c6be9567..91eae7ca3c442 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,4 @@ +<<<<<<< HEAD # Contributing ## License @@ -92,3 +93,15 @@ Merge of pull request is done only by project maintainers. There are three optio Squashing is done to shorten history and make sure that the project is buildable on any commit. - [Create a merge commit] Used for pull down PRs to avoid duplication of LLVM commits. +======= +# Contributing to LLVM + +Thank you for your interest in contributing to LLVM! There are many ways to +contribute, and we appreciate all contributions. + +To get started with contributing, please take a look at the +[Contributing to LLVM](https://llvm.org/docs/Contributing.html) guide. It +describes how to get involved, raise issues and submit patches. Please note +that at the moment the LLVM project does not use either Github pull requests +or Github issues. +>>>>>>> effcdc3a82f2a32829170e7f7a2ff3d7853b612d diff --git a/README.md b/README.md index e09fe289dd8af..0e4e25176b9a9 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,18 @@ ## Introduction +<<<<<<< HEAD Intel staging area for llvm.org contribution. Home for Intel LLVM-based projects: - SYCL* Compiler and Runtimes - compiler and runtime libraries for SYCL ([https://www.khronos.org/sycl/](https://www.khronos.org/sycl/)). See **sycl** branch. +======= +The README briefly describes how to get started with building LLVM. +For more information on how to contribute to the LLVM project, please +take a look at the +[Contributing to LLVM](https://llvm.org/docs/Contributing.html) guide. + +## Getting Started with the LLVM System +>>>>>>> effcdc3a82f2a32829170e7f7a2ff3d7853b612d ## License See [LICENSE.txt](sycl/LICENSE.TXT) for details. diff --git a/clang-tools-extra/clang-include-fixer/tool/clang-include-fixer.py b/clang-tools-extra/clang-include-fixer/tool/clang-include-fixer.py index df05101e4fd8c..e3a52f094f663 100644 --- a/clang-tools-extra/clang-include-fixer/tool/clang-include-fixer.py +++ b/clang-tools-extra/clang-include-fixer/tool/clang-include-fixer.py @@ -17,6 +17,7 @@ # It operates on the current, potentially unsaved buffer and does not create # or save any files. To revert a fix, just undo. +from __future__ import print_function import argparse import difflib import json @@ -79,7 +80,7 @@ def GetUserSelection(message, headers, maximum_suggested_headers): except Exception: # Show a new prompt on invalid option instead of aborting so that users # don't need to wait for another clang-include-fixer run. - print >> sys.stderr, "Invalid option:", res + print("Invalid option: {}".format(res), file=sys.stderr) return GetUserSelection(message, headers, maximum_suggested_headers) return headers[idx - 1] @@ -95,7 +96,7 @@ def execute(command, text): p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, startupinfo=startupinfo) - return p.communicate(input=text) + return p.communicate(input=text.encode('utf-8')) def InsertHeaderToVimBuffer(header, text): @@ -159,7 +160,7 @@ def main(): if query_mode: symbol = get_symbol_under_cursor() if len(symbol) == 0: - print "Skip querying empty symbol." 
+ print("Skip querying empty symbol.") return command = [binary, "-stdin", "-query-symbol="+get_symbol_under_cursor(), "-db=" + args.db, "-input=" + args.input, @@ -170,13 +171,14 @@ def main(): "-input=" + args.input, vim.current.buffer.name] stdout, stderr = execute(command, text) if stderr: - print >> sys.stderr, "Error while running clang-include-fixer: " + stderr + print("Error while running clang-include-fixer: {}".format(stderr), + file=sys.stderr) return include_fixer_context = json.loads(stdout) query_symbol_infos = include_fixer_context["QuerySymbolInfos"] if not query_symbol_infos: - print "The file is fine, no need to add a header." + print("The file is fine, no need to add a header.") return symbol = query_symbol_infos[0]["RawIdentifier"] # The header_infos is already sorted by clang-include-fixer. @@ -192,7 +194,7 @@ def main(): unique_headers.append(header) if not unique_headers: - print "Couldn't find a header for {0}.".format(symbol) + print("Couldn't find a header for {0}.".format(symbol)) return try: @@ -207,9 +209,9 @@ def main(): include_fixer_context["HeaderInfos"] = inserted_header_infos InsertHeaderToVimBuffer(include_fixer_context, text) - print "Added #include {0} for {1}.".format(selected, symbol) + print("Added #include {0} for {1}.".format(selected, symbol)) except Exception as error: - print >> sys.stderr, error.message + print(error, file=sys.stderr) return diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 91e8ebee13686..40aaf402ec0e1 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -314,10 +314,8 @@ ClangTidyASTConsumerFactory::ClangTidyASTConsumerFactory( IntrusiveRefCntPtr OverlayFS) : Context(Context), OverlayFS(OverlayFS), CheckFactories(new ClangTidyCheckFactories) { - for (ClangTidyModuleRegistry::iterator I = ClangTidyModuleRegistry::begin(), - E = ClangTidyModuleRegistry::end(); - I != E; ++I) { - std::unique_ptr Module(I->instantiate()); + for (ClangTidyModuleRegistry::entry E : ClangTidyModuleRegistry::entries()) { + std::unique_ptr Module = E.instantiate(); Module->addCheckFactories(*CheckFactories); } } diff --git a/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp index 7ca5c1e3454b1..8d4366b51a3ec 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MacroParenthesesCheck.cpp @@ -54,7 +54,7 @@ static bool isSurroundedRight(const Token &T) { /// Is given TokenKind a keyword? static bool isKeyword(const Token &T) { // FIXME: better matching of keywords to avoid false positives. - return T.isOneOf(tok::kw_case, tok::kw_const, tok::kw_struct); + return T.isOneOf(tok::kw_if, tok::kw_case, tok::kw_const, tok::kw_struct); } /// Warning is written when one of these operators are not within parentheses. 
diff --git a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp index d94731beba945..9b34f5ab55a7f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/SuspiciousSemicolonCheck.cpp @@ -20,7 +20,8 @@ namespace bugprone { void SuspiciousSemicolonCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher( stmt(anyOf(ifStmt(hasThen(nullStmt().bind("semi")), - unless(hasElse(stmt()))), + unless(hasElse(stmt())), + unless(isConstexpr())), forStmt(hasBody(nullStmt().bind("semi"))), cxxForRangeStmt(hasBody(nullStmt().bind("semi"))), whileStmt(hasBody(nullStmt().bind("semi"))))) diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp index 2d4475c991ca2..c9313dbae96a5 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp @@ -14,11 +14,12 @@ #include "clang/Basic/SourceLocation.h" #include "clang/Lex/Lexer.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/Regex.h" #include "llvm/Support/raw_ostream.h" #include @@ -34,45 +35,270 @@ namespace modernize { namespace { enum BindArgumentKind { BK_Temporary, BK_Placeholder, BK_CallExpr, BK_Other }; +enum CaptureMode { CM_None, CM_ByRef, CM_ByValue, CM_InitExpression }; + +enum CallableType { + CT_Other, // unknown + CT_Function, // global or static function + CT_MemberFunction, // member function with implicit this + CT_Object, // object with operator() +}; + +enum CallableMaterializationKind { + CMK_Other, // unknown + CMK_Function, // callable is the name of a member or non-member function. + CMK_VariableRef, // callable is a simple expression involving a global or + // local variable. + CMK_CallExpression, // callable is obtained as the result of a call expression +}; struct BindArgument { - StringRef Tokens; + // A rough classification of the type of expression this argument was. BindArgumentKind Kind = BK_Other; + + // If this argument required a capture, a value indicating how it was + // captured. + CaptureMode CM = CM_None; + + // The exact spelling of this argument in the source code. + StringRef SourceTokens; + + // The identifier of the variable within the capture list. This may be + // different from UsageIdentifier for example in the expression *d, where the + // variable is captured as d, but referred to as *d. + std::string CaptureIdentifier; + + // If this is a placeholder or capture init expression, contains the tokens + // used to refer to this parameter from within the body of the lambda. + std::string UsageIdentifier; + + // If Kind == BK_Placeholder, the index of the placeholder. size_t PlaceHolderIndex = 0; + + // True if the argument is used inside the lambda, false otherwise. + bool IsUsed = false; + + // The actual Expr object representing this expression. 
+ const Expr *E = nullptr; +}; + +struct CallableInfo { + CallableType Type = CT_Other; + CallableMaterializationKind Materialization = CMK_Other; + CaptureMode CM = CM_None; + StringRef SourceTokens; + std::string CaptureIdentifier; + std::string UsageIdentifier; + StringRef CaptureInitializer; + const FunctionDecl *Decl = nullptr; +}; + +struct LambdaProperties { + CallableInfo Callable; + SmallVector BindArguments; + StringRef BindNamespace; + bool IsFixitSupported = false; }; } // end namespace +static const Expr *ignoreTemporariesAndPointers(const Expr *E) { + if (const auto *T = dyn_cast(E)) + return ignoreTemporariesAndPointers(T->getSubExpr()); + + const Expr *F = E->IgnoreImplicit(); + if (E != F) + return ignoreTemporariesAndPointers(F); + + return E; +} + +static const Expr *ignoreTemporariesAndConstructors(const Expr *E) { + if (const auto *T = dyn_cast(E)) + return ignoreTemporariesAndConstructors(T->getArg(0)); + + const Expr *F = E->IgnoreImplicit(); + if (E != F) + return ignoreTemporariesAndPointers(F); + + return E; +} + +static StringRef getSourceTextForExpr(const MatchFinder::MatchResult &Result, + const Expr *E) { + return Lexer::getSourceText( + CharSourceRange::getTokenRange(E->getBeginLoc(), E->getEndLoc()), + *Result.SourceManager, Result.Context->getLangOpts()); +} + +static bool isCallExprNamed(const Expr *E, StringRef Name) { + const auto *CE = dyn_cast(E->IgnoreImplicit()); + if (!CE) + return false; + const auto *ND = dyn_cast(CE->getCalleeDecl()); + if (!ND) + return false; + return ND->getQualifiedNameAsString() == Name; +} + +static void +initializeBindArgumentForCallExpr(const MatchFinder::MatchResult &Result, + BindArgument &B, const CallExpr *CE, + unsigned &CaptureIndex) { + // std::ref(x) means to capture x by reference. 
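+  // e.g. std::bind(f, std::ref(X)) must become [&X] { return f(X); };
+  // capturing X by value would silently change the bound call's semantics.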
+ if (isCallExprNamed(CE, "boost::ref") || isCallExprNamed(CE, "std::ref")) { + B.Kind = BK_Other; + B.CM = CM_ByRef; + B.UsageIdentifier = getSourceTextForExpr(Result, CE->getArg(0)); + } else { + B.Kind = BK_CallExpr; + B.CM = CM_InitExpression; + B.UsageIdentifier = "capture" + llvm::utostr(CaptureIndex++); + } + B.CaptureIdentifier = B.UsageIdentifier; +} + +static bool anyDescendantIsLocal(const Stmt *Statement) { + if (const auto *DeclRef = dyn_cast(Statement)) { + const ValueDecl *Decl = DeclRef->getDecl(); + if (const auto *Var = dyn_cast_or_null(Decl)) { + if (Var->isLocalVarDeclOrParm()) + return true; + } + } else if (isa(Statement)) + return true; + + return any_of(Statement->children(), anyDescendantIsLocal); +} + +static bool tryCaptureAsLocalVariable(const MatchFinder::MatchResult &Result, + BindArgument &B, const Expr *E) { + if (const auto *BTE = dyn_cast(E)) { + if (const auto *CE = dyn_cast(BTE->getSubExpr())) + return tryCaptureAsLocalVariable(Result, B, CE->getArg(0)); + return false; + } + + const auto *DRE = dyn_cast(E->IgnoreImplicit()); + if (!DRE) + return false; + + const auto *VD = dyn_cast(DRE->getDecl()); + if (!VD || !VD->isLocalVarDeclOrParm()) + return false; + + B.CM = CM_ByValue; + B.UsageIdentifier = getSourceTextForExpr(Result, E); + B.CaptureIdentifier = B.UsageIdentifier; + return true; +} + +static bool tryCaptureAsMemberVariable(const MatchFinder::MatchResult &Result, + BindArgument &B, const Expr *E) { + if (const auto *BTE = dyn_cast(E)) { + if (const auto *CE = dyn_cast(BTE->getSubExpr())) + return tryCaptureAsMemberVariable(Result, B, CE->getArg(0)); + return false; + } + + E = E->IgnoreImplicit(); + if (isa(E)) { + B.CM = CM_ByValue; + B.UsageIdentifier = getSourceTextForExpr(Result, E); + B.CaptureIdentifier = "this"; + return true; + } + + const auto *ME = dyn_cast(E); + if (!ME) + return false; + + if (!ME->isLValue() || !isa(ME->getMemberDecl())) + return false; + + B.CM = CM_ByValue; + B.UsageIdentifier = getSourceTextForExpr(Result, E); + B.CaptureIdentifier = "this"; + return true; +} + static SmallVector -buildBindArguments(const MatchFinder::MatchResult &Result, const CallExpr *C) { +buildBindArguments(const MatchFinder::MatchResult &Result, + const CallableInfo &Callable) { SmallVector BindArguments; llvm::Regex MatchPlaceholder("^_([0-9]+)$"); + const auto *BindCall = Result.Nodes.getNodeAs("bind"); + // Start at index 1 as first argument to bind is the function name. - for (size_t I = 1, ArgCount = C->getNumArgs(); I < ArgCount; ++I) { - const Expr *E = C->getArg(I); - BindArgument B; - if (const auto *M = dyn_cast(E)) { - const auto *TE = M->getSubExpr(); - B.Kind = isa(TE) ? 
BK_CallExpr : BK_Temporary; - } + unsigned CaptureIndex = 0; + for (size_t I = 1, ArgCount = BindCall->getNumArgs(); I < ArgCount; ++I) { + + const Expr *E = BindCall->getArg(I); + BindArgument &B = BindArguments.emplace_back(); + + size_t ArgIndex = I - 1; + if (Callable.Type == CT_MemberFunction) + --ArgIndex; + + bool IsObjectPtr = (I == 1 && Callable.Type == CT_MemberFunction); + B.E = E; + B.SourceTokens = getSourceTextForExpr(Result, E); - B.Tokens = Lexer::getSourceText( - CharSourceRange::getTokenRange(E->getBeginLoc(), E->getEndLoc()), - *Result.SourceManager, Result.Context->getLangOpts()); + if (!Callable.Decl || ArgIndex < Callable.Decl->getNumParams() || + IsObjectPtr) + B.IsUsed = true; SmallVector Matches; - if (B.Kind == BK_Other && MatchPlaceholder.match(B.Tokens, &Matches)) { + if (MatchPlaceholder.match(B.SourceTokens, &Matches)) { B.Kind = BK_Placeholder; B.PlaceHolderIndex = std::stoi(Matches[1]); + B.UsageIdentifier = "PH" + llvm::utostr(B.PlaceHolderIndex); + B.CaptureIdentifier = B.UsageIdentifier; + continue; + } + + if (const auto *CE = + dyn_cast(ignoreTemporariesAndConstructors(E))) { + initializeBindArgumentForCallExpr(Result, B, CE, CaptureIndex); + continue; + } + + if (tryCaptureAsLocalVariable(Result, B, B.E) || + tryCaptureAsMemberVariable(Result, B, B.E)) + continue; + + // If it's not something we recognize, capture it by init expression to be + // safe. + B.Kind = BK_Other; + if (IsObjectPtr) { + B.CM = CM_InitExpression; + B.UsageIdentifier = "ObjectPtr"; + B.CaptureIdentifier = B.UsageIdentifier; + } else if (anyDescendantIsLocal(B.E)) { + B.CM = CM_InitExpression; + B.CaptureIdentifier = "capture" + llvm::utostr(CaptureIndex++); + B.UsageIdentifier = B.CaptureIdentifier; } - BindArguments.push_back(B); } return BindArguments; } -static void addPlaceholderArgs(const ArrayRef Args, - llvm::raw_ostream &Stream) { +static int findPositionOfPlaceholderUse(ArrayRef Args, + size_t PlaceholderIndex) { + for (size_t I = 0; I < Args.size(); ++I) + if (Args[I].PlaceHolderIndex == PlaceholderIndex) + return I; + + return -1; +} + +static void addPlaceholderArgs(const LambdaProperties &LP, + llvm::raw_ostream &Stream, + bool PermissiveParameterList) { + + ArrayRef Args = LP.BindArguments; + auto MaxPlaceholderIt = std::max_element(Args.begin(), Args.end(), [](const BindArgument &B1, const BindArgument &B2) { @@ -80,27 +306,41 @@ static void addPlaceholderArgs(const ArrayRef Args, }); // Placeholders (if present) have index 1 or greater. 
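  // For example, std::bind(f, _2) still needs two lambda parameters, with
  // only the used one named: [](auto &&, auto && PH2) { return f(PH2); }.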
- if (MaxPlaceholderIt == Args.end() || MaxPlaceholderIt->PlaceHolderIndex == 0) + if (!PermissiveParameterList && (MaxPlaceholderIt == Args.end() || + MaxPlaceholderIt->PlaceHolderIndex == 0)) return; size_t PlaceholderCount = MaxPlaceholderIt->PlaceHolderIndex; Stream << "("; StringRef Delimiter = ""; for (size_t I = 1; I <= PlaceholderCount; ++I) { - Stream << Delimiter << "auto && arg" << I; + Stream << Delimiter << "auto &&"; + + int ArgIndex = findPositionOfPlaceholderUse(Args, I); + + if (ArgIndex != -1 && Args[ArgIndex].IsUsed) + Stream << " " << Args[ArgIndex].UsageIdentifier; Delimiter = ", "; } + if (PermissiveParameterList) + Stream << Delimiter << "auto && ..."; Stream << ")"; } -static void addFunctionCallArgs(const ArrayRef Args, +static void addFunctionCallArgs(ArrayRef Args, llvm::raw_ostream &Stream) { StringRef Delimiter = ""; - for (const auto &B : Args) { - if (B.PlaceHolderIndex) - Stream << Delimiter << "arg" << B.PlaceHolderIndex; - else - Stream << Delimiter << B.Tokens; + + for (int I = 0, Size = Args.size(); I < Size; ++I) { + const BindArgument &B = Args[I]; + + Stream << Delimiter; + + if (B.Kind == BK_Placeholder || B.CM != CM_None) + Stream << B.UsageIdentifier; + else if (B.CM == CM_None) + Stream << B.SourceTokens; + Delimiter = ", "; } } @@ -116,59 +356,301 @@ static bool isPlaceHolderIndexRepeated(const ArrayRef Args) { return false; } +static std::vector +findCandidateCallOperators(const CXXRecordDecl *RecordDecl, size_t NumArgs) { + std::vector Candidates; + + for (const clang::CXXMethodDecl *Method : RecordDecl->methods()) { + OverloadedOperatorKind OOK = Method->getOverloadedOperator(); + + if (OOK != OverloadedOperatorKind::OO_Call) + continue; + + if (Method->getNumParams() > NumArgs) + continue; + + Candidates.push_back(Method); + } + + return Candidates; +} + +static bool isFixitSupported(const CallableInfo &Callee, + ArrayRef Args) { + // Do not attempt to create fixits for nested std::bind or std::ref. + // Supporting nested std::bind will be more difficult due to placeholder + // sharing between outer and inner std::bind invocations, and std::ref + // requires us to capture some parameters by reference instead of by value. + if (any_of(Args, [](const BindArgument &B) { + return isCallExprNamed(B.E, "boost::bind") || + isCallExprNamed(B.E, "std::bind"); + })) { + return false; + } + + // Do not attempt to create fixits when placeholders are reused. + // Unused placeholders are supported by requiring C++14 generic lambdas. + // FIXME: Support this case by deducing the common type. + if (isPlaceHolderIndexRepeated(Args)) + return false; + + // If we can't determine the Decl being used, don't offer a fixit. 
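+  // Without it we cannot tell how many of the bound arguments the callee
+  // actually consumes (IsUsed above), or name a member function in the body.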
+ if (!Callee.Decl) + return false; + + if (Callee.Type == CT_Other || Callee.Materialization == CMK_Other) + return false; + + return true; +} + +const FunctionDecl *getCallOperator(const CXXRecordDecl *Callable, + size_t NumArgs) { + std::vector Candidates = + findCandidateCallOperators(Callable, NumArgs); + if (Candidates.size() != 1) + return nullptr; + + return Candidates.front(); +} + +const FunctionDecl * +getCallMethodDecl(const MatchFinder::MatchResult &Result, CallableType Type, + CallableMaterializationKind Materialization) { + + const Expr *Callee = Result.Nodes.getNodeAs("ref"); + const Expr *CallExpression = ignoreTemporariesAndPointers(Callee); + + if (Type == CT_Object) { + const auto *BindCall = Result.Nodes.getNodeAs("bind"); + size_t NumArgs = BindCall->getNumArgs() - 1; + return getCallOperator(Callee->getType()->getAsCXXRecordDecl(), NumArgs); + } + + if (Materialization == CMK_Function) { + if (const auto *DRE = dyn_cast(CallExpression)) + return dyn_cast(DRE->getDecl()); + } + + // Maybe this is an indirect call through a function pointer or something + // where we can't determine the exact decl. + return nullptr; +} + +static CallableType getCallableType(const MatchFinder::MatchResult &Result) { + const auto *CallableExpr = Result.Nodes.getNodeAs("ref"); + + QualType QT = CallableExpr->getType(); + if (QT->isMemberFunctionPointerType()) + return CT_MemberFunction; + + if (QT->isFunctionPointerType() || QT->isFunctionReferenceType() || + QT->isFunctionType()) + return CT_Function; + + if (QT->isRecordType()) { + const CXXRecordDecl *Decl = QT->getAsCXXRecordDecl(); + if (!Decl) + return CT_Other; + + return CT_Object; + } + + return CT_Other; +} + +static CallableMaterializationKind +getCallableMaterialization(const MatchFinder::MatchResult &Result) { + const auto *CallableExpr = Result.Nodes.getNodeAs("ref"); + + const auto *NoTemporaries = ignoreTemporariesAndPointers(CallableExpr); + + if (isa(NoTemporaries)) + return CMK_CallExpression; + + if (isa(NoTemporaries) || + isa(NoTemporaries)) + return CMK_Function; + + if (const auto *DRE = dyn_cast(NoTemporaries)) { + if (isa(DRE->getDecl())) + return CMK_Function; + if (isa(DRE->getDecl())) + return CMK_VariableRef; + } + + return CMK_Other; +} + +static LambdaProperties +getLambdaProperties(const MatchFinder::MatchResult &Result) { + const auto *CalleeExpr = Result.Nodes.getNodeAs("ref"); + + LambdaProperties LP; + + const auto *Bind = Result.Nodes.getNodeAs("bind"); + const auto *Decl = dyn_cast(Bind->getCalleeDecl()); + const auto *NS = + dyn_cast(Decl->getEnclosingNamespaceContext()); + while (NS->isInlineNamespace()) + NS = dyn_cast(NS->getDeclContext()); + LP.BindNamespace = NS->getName(); + + LP.Callable.Type = getCallableType(Result); + LP.Callable.Materialization = getCallableMaterialization(Result); + LP.Callable.Decl = + getCallMethodDecl(Result, LP.Callable.Type, LP.Callable.Materialization); + LP.Callable.SourceTokens = getSourceTextForExpr(Result, CalleeExpr); + if (LP.Callable.Materialization == CMK_VariableRef) { + LP.Callable.CM = CM_ByValue; + LP.Callable.UsageIdentifier = getSourceTextForExpr(Result, CalleeExpr); + LP.Callable.CaptureIdentifier = + getSourceTextForExpr(Result, ignoreTemporariesAndPointers(CalleeExpr)); + } else if (LP.Callable.Materialization == CMK_CallExpression) { + LP.Callable.CM = CM_InitExpression; + LP.Callable.UsageIdentifier = "Func"; + LP.Callable.CaptureIdentifier = "Func"; + LP.Callable.CaptureInitializer = getSourceTextForExpr(Result, CalleeExpr); + } + + 
LP.BindArguments = buildBindArguments(Result, LP.Callable); + + LP.IsFixitSupported = isFixitSupported(LP.Callable, LP.BindArguments); + + return LP; +} + +static bool emitCapture(llvm::StringSet<> &CaptureSet, StringRef Delimiter, + CaptureMode CM, StringRef Identifier, + StringRef InitExpression, raw_ostream &Stream) { + if (CM == CM_None) + return false; + + // This capture has already been emitted. + if (CaptureSet.count(Identifier) != 0) + return false; + + Stream << Delimiter; + + if (CM == CM_ByRef) + Stream << "&"; + Stream << Identifier; + if (CM == CM_InitExpression) + Stream << " = " << InitExpression; + + CaptureSet.insert(Identifier); + return true; +} + +static void emitCaptureList(const LambdaProperties &LP, + const MatchFinder::MatchResult &Result, + raw_ostream &Stream) { + llvm::StringSet<> CaptureSet; + bool AnyCapturesEmitted = false; + + AnyCapturesEmitted = emitCapture(CaptureSet, "", LP.Callable.CM, + LP.Callable.CaptureIdentifier, + LP.Callable.CaptureInitializer, Stream); + + for (const BindArgument &B : LP.BindArguments) { + if (B.CM == CM_None || !B.IsUsed) + continue; + + StringRef Delimiter = AnyCapturesEmitted ? ", " : ""; + + if (emitCapture(CaptureSet, Delimiter, B.CM, B.CaptureIdentifier, + B.SourceTokens, Stream)) + AnyCapturesEmitted = true; + } +} + +static ArrayRef +getForwardedArgumentList(const LambdaProperties &P) { + ArrayRef Args = makeArrayRef(P.BindArguments); + if (P.Callable.Type != CT_MemberFunction) + return Args; + + return Args.drop_front(); +} +AvoidBindCheck::AvoidBindCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + PermissiveParameterList(Options.get("PermissiveParameterList", 0) != 0) {} + void AvoidBindCheck::registerMatchers(MatchFinder *Finder) { if (!getLangOpts().CPlusPlus14) // Need C++14 for generic lambdas. return; Finder->addMatcher( callExpr( - callee(namedDecl(hasName("::std::bind"))), - hasArgument(0, declRefExpr(to(functionDecl().bind("f"))).bind("ref"))) + callee(namedDecl( + anyOf(hasName("::boost::bind"), hasName("::std::bind")))), + hasArgument( + 0, anyOf(expr(hasType(memberPointerType())).bind("ref"), + expr(hasParent(materializeTemporaryExpr().bind("ref"))), + expr().bind("ref")))) .bind("bind"), this); } void AvoidBindCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedDecl = Result.Nodes.getNodeAs("bind"); - auto Diag = diag(MatchedDecl->getBeginLoc(), "prefer a lambda to std::bind"); - - const auto Args = buildBindArguments(Result, MatchedDecl); - // Do not attempt to create fixits for nested call expressions. - // FIXME: Create lambda capture variables to capture output of calls. - // NOTE: Supporting nested std::bind will be more difficult due to placeholder - // sharing between outer and inner std:bind invocations. - if (llvm::any_of(Args, - [](const BindArgument &B) { return B.Kind == BK_CallExpr; })) - return; - - // Do not attempt to create fixits when placeholders are reused. - // Unused placeholders are supported by requiring C++14 generic lambdas. - // FIXME: Support this case by deducing the common type. - if (isPlaceHolderIndexRepeated(Args)) + LambdaProperties LP = getLambdaProperties(Result); + auto Diag = + diag(MatchedDecl->getBeginLoc(), + formatv("prefer a lambda to {0}::bind", LP.BindNamespace).str()); + if (!LP.IsFixitSupported) return; - const auto *F = Result.Nodes.getNodeAs("f"); - - // std::bind can support argument count mismatch between its arguments and the - // bound function's arguments. 
Do not attempt to generate a fixit for such - // cases. - // FIXME: Support this case by creating unused lambda capture variables. - if (F->getNumParams() != Args.size()) - return; + const auto *Ref = Result.Nodes.getNodeAs("ref"); std::string Buffer; llvm::raw_string_ostream Stream(Buffer); - bool HasCapturedArgument = llvm::any_of( - Args, [](const BindArgument &B) { return B.Kind == BK_Other; }); - const auto *Ref = Result.Nodes.getNodeAs("ref"); - Stream << "[" << (HasCapturedArgument ? "=" : "") << "]"; - addPlaceholderArgs(Args, Stream); - Stream << " { return "; - Ref->printPretty(Stream, nullptr, Result.Context->getPrintingPolicy()); + Stream << "["; + emitCaptureList(LP, Result, Stream); + Stream << "]"; + + ArrayRef FunctionCallArgs = makeArrayRef(LP.BindArguments); + + addPlaceholderArgs(LP, Stream, PermissiveParameterList); + + if (LP.Callable.Type == CT_Function) { + StringRef SourceTokens = LP.Callable.SourceTokens; + SourceTokens.consume_front("&"); + Stream << " { return " << SourceTokens; + } else if (LP.Callable.Type == CT_MemberFunction) { + const auto *MethodDecl = dyn_cast(LP.Callable.Decl); + const BindArgument &ObjPtr = FunctionCallArgs.front(); + + Stream << " { "; + if (!isa(ignoreTemporariesAndPointers(ObjPtr.E))) { + Stream << ObjPtr.UsageIdentifier; + Stream << "->"; + } + + Stream << MethodDecl->getName(); + } else { + Stream << " { return "; + switch (LP.Callable.CM) { + case CM_ByValue: + case CM_ByRef: + if (LP.Callable.UsageIdentifier != LP.Callable.CaptureIdentifier) { + Stream << "(" << LP.Callable.UsageIdentifier << ")"; + break; + } + LLVM_FALLTHROUGH; + case CM_InitExpression: + Stream << LP.Callable.UsageIdentifier; + break; + default: + Ref->printPretty(Stream, nullptr, Result.Context->getPrintingPolicy()); + } + } + Stream << "("; - addFunctionCallArgs(Args, Stream); + + addFunctionCallArgs(getForwardedArgumentList(LP), Stream); Stream << "); }"; Diag << FixItHint::CreateReplacement(MatchedDecl->getSourceRange(), diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h index 4b393303b7ef0..5576fe6c3bd5d 100644 --- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h +++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.h @@ -23,10 +23,12 @@ namespace modernize { /// http://clang.llvm.org/extra/clang-tidy/checks/modernize-avoid-std-bind.html class AvoidBindCheck : public ClangTidyCheck { public: - AvoidBindCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + AvoidBindCheck(StringRef Name, ClangTidyContext *Context); void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + +private: + bool PermissiveParameterList = false; }; } // namespace modernize } // namespace tidy diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp index a2a56241e8ab6..eb3d7c505b831 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.cpp @@ -19,44 +19,6 @@ namespace clang { namespace tidy { namespace readability { -namespace { -class NamespaceCommentPPCallbacks : public PPCallbacks { -public: - NamespaceCommentPPCallbacks(Preprocessor *PP, NamespaceCommentCheck *Check) - : PP(PP), Check(Check) {} - - void MacroDefined(const Token &MacroNameTok, const MacroDirective *MD) { - // Record all 
defined macros. We store the whole token to compare names - // later. - - const MacroInfo * MI = MD->getMacroInfo(); - - if (MI->isFunctionLike()) - return; - - std::string ValueBuffer; - llvm::raw_string_ostream Value(ValueBuffer); - - SmallString<128> SpellingBuffer; - bool First = true; - for (const auto &T : MI->tokens()) { - if (!First && T.hasLeadingSpace()) - Value << ' '; - - Value << PP->getSpelling(T, SpellingBuffer); - First = false; - } - - Check->addMacro(MacroNameTok.getIdentifierInfo()->getName().str(), - Value.str()); - } - -private: - Preprocessor *PP; - NamespaceCommentCheck *Check; -}; -} // namespace - NamespaceCommentCheck::NamespaceCommentCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), @@ -78,37 +40,24 @@ void NamespaceCommentCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(namespaceDecl().bind("namespace"), this); } -void NamespaceCommentCheck::registerPPCallbacks( - const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) { - PP->addPPCallbacks(std::make_unique(PP, this)); -} - static bool locationsInSameFile(const SourceManager &Sources, SourceLocation Loc1, SourceLocation Loc2) { return Loc1.isFileID() && Loc2.isFileID() && Sources.getFileID(Loc1) == Sources.getFileID(Loc2); } -std::string NamespaceCommentCheck::getNamespaceComment(const NamespaceDecl *ND, - bool InsertLineBreak) { +static std::string getNamespaceComment(const NamespaceDecl *ND, + bool InsertLineBreak) { std::string Fix = "// namespace"; - if (!ND->isAnonymousNamespace()) { - bool IsNamespaceMacroExpansion; - StringRef MacroDefinition; - std::tie(IsNamespaceMacroExpansion, MacroDefinition) = - isNamespaceMacroExpansion(ND->getName()); - - Fix.append(" ").append(IsNamespaceMacroExpansion ? MacroDefinition - : ND->getName()); - } + if (!ND->isAnonymousNamespace()) + Fix.append(" ").append(ND->getNameAsString()); if (InsertLineBreak) Fix.append("\n"); return Fix; } -std::string -NamespaceCommentCheck::getNamespaceComment(const std::string &NameSpaceName, - bool InsertLineBreak) { +static std::string getNamespaceComment(const std::string &NameSpaceName, + bool InsertLineBreak) { std::string Fix = "// namespace "; Fix.append(NameSpaceName); if (InsertLineBreak) @@ -116,32 +65,6 @@ NamespaceCommentCheck::getNamespaceComment(const std::string &NameSpaceName, return Fix; } -void NamespaceCommentCheck::addMacro(const std::string &Name, - const std::string &Value) noexcept { - Macros.emplace_back(Name, Value); -} - -bool NamespaceCommentCheck::isNamespaceMacroDefinition( - const StringRef NameSpaceName) { - return llvm::any_of(Macros, [&NameSpaceName](const auto &Macro) { - return NameSpaceName == Macro.first; - }); -} - -std::tuple NamespaceCommentCheck::isNamespaceMacroExpansion( - const StringRef NameSpaceName) { - const auto &MacroIt = - llvm::find_if(Macros, [&NameSpaceName](const auto &Macro) { - return NameSpaceName == Macro.second; - }); - - const bool IsNamespaceMacroExpansion = Macros.end() != MacroIt; - - return std::make_tuple(IsNamespaceMacroExpansion, - IsNamespaceMacroExpansion ? StringRef(MacroIt->first) - : NameSpaceName); -} - void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { const auto *ND = Result.Nodes.getNodeAs("namespace"); const SourceManager &Sources = *Result.SourceManager; @@ -220,48 +143,28 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { StringRef NamespaceNameInComment = Groups.size() > 5 ? Groups[5] : ""; StringRef Anonymous = Groups.size() > 3 ? 
Groups[3] : ""; - // Don't allow to use macro expansion in closing comment. - // FIXME: Use Structured Bindings once C++17 features will be enabled. - bool IsNamespaceMacroExpansion; - StringRef MacroDefinition; - std::tie(IsNamespaceMacroExpansion, MacroDefinition) = - isNamespaceMacroExpansion(NamespaceNameInComment); - if (IsNested && NestedNamespaceName == NamespaceNameInComment) { // C++17 nested namespace. return; } else if ((ND->isAnonymousNamespace() && NamespaceNameInComment.empty()) || - (((ND->getNameAsString() == NamespaceNameInComment) && - Anonymous.empty()) && - !IsNamespaceMacroExpansion)) { + (ND->getNameAsString() == NamespaceNameInComment && + Anonymous.empty())) { // Check if the namespace in the comment is the same. // FIXME: Maybe we need a strict mode, where we always fix namespace // comments with different format. return; } - // Allow using macro definitions in closing comment. - if (isNamespaceMacroDefinition(NamespaceNameInComment)) - return; - // Otherwise we need to fix the comment. NeedLineBreak = Comment.startswith("/*"); OldCommentRange = SourceRange(AfterRBrace, Loc.getLocWithOffset(Tok.getLength())); - - if (IsNamespaceMacroExpansion) { - Message = (llvm::Twine("%0 ends with a comment that refers to an " - "expansion of macro")) - .str(); - NestedNamespaceName = MacroDefinition; - } else { - Message = (llvm::Twine("%0 ends with a comment that refers to a " - "wrong namespace '") + - NamespaceNameInComment + "'") - .str(); - } - + Message = + (llvm::Twine( + "%0 ends with a comment that refers to a wrong namespace '") + + NamespaceNameInComment + "'") + .str(); } else if (Comment.startswith("//")) { // Assume that this is an unrecognized form of a namespace closing line // comment. Replace it. @@ -274,16 +177,6 @@ void NamespaceCommentCheck::check(const MatchFinder::MatchResult &Result) { // multi-line or there may be other tokens behind it. } - // Print Macro definition instead of expansion. - // FIXME: Use Structured Bindings once C++17 features will be enabled. - bool IsNamespaceMacroExpansion; - StringRef MacroDefinition; - std::tie(IsNamespaceMacroExpansion, MacroDefinition) = - isNamespaceMacroExpansion(NestedNamespaceName); - - if (IsNamespaceMacroExpansion) - NestedNamespaceName = MacroDefinition; - std::string NamespaceName = ND->isAnonymousNamespace() ? 
"anonymous namespace" diff --git a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h index bc5c11e7b71b9..712cd4662965e 100644 --- a/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h +++ b/clang-tools-extra/clang-tidy/readability/NamespaceCommentCheck.h @@ -26,29 +26,14 @@ class NamespaceCommentCheck : public ClangTidyCheck { NamespaceCommentCheck(StringRef Name, ClangTidyContext *Context); void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; - void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, - Preprocessor *ModuleExpanderPP) override; - - void addMacro(const std::string &Name, const std::string &Value) noexcept; private: void storeOptions(ClangTidyOptions::OptionMap &Options) override; - std::string getNamespaceComment(const NamespaceDecl *ND, - bool InsertLineBreak); - std::string getNamespaceComment(const std::string &NameSpaceName, - bool InsertLineBreak); - bool isNamespaceMacroDefinition(const StringRef NameSpaceName); - std::tuple - isNamespaceMacroExpansion(const StringRef NameSpaceName); llvm::Regex NamespaceCommentPattern; const unsigned ShortNamespaceLines; const unsigned SpacesBeforeComments; llvm::SmallVector Ends; - - // Store macros to verify that warning is not thrown when namespace name is a - // preprocessed define. - std::vector> Macros; }; } // namespace readability diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp index df83de856238f..ad6182def20d2 100644 --- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp +++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp @@ -18,6 +18,7 @@ #include "../ClangTidyForceLinker.h" #include "../GlobList.h" #include "clang/Tooling/CommonOptionsParser.h" +#include "llvm/Support/InitLLVM.h" #include "llvm/Support/Process.h" #include "llvm/Support/Signals.h" #include "llvm/Support/TargetSelect.h" @@ -327,7 +328,7 @@ getVfsFromFile(const std::string &OverlayFile, } static int clangTidyMain(int argc, const char **argv) { - llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); + llvm::InitLLVM X(argc, argv); CommonOptionsParser OptionsParser(argc, argv, ClangTidyCategory, cl::ZeroOrMore); llvm::IntrusiveRefCntPtr BaseFS( diff --git a/clang-tools-extra/clangd/CMakeLists.txt b/clang-tools-extra/clangd/CMakeLists.txt index 8ab2ae6b91d3a..c1aea3bd119d1 100644 --- a/clang-tools-extra/clangd/CMakeLists.txt +++ b/clang-tools-extra/clangd/CMakeLists.txt @@ -69,6 +69,7 @@ add_clang_library(clangDaemon Selection.cpp SemanticHighlighting.cpp SemanticSelection.cpp + Shutdown.cpp SourceCode.cpp QueryDriverDatabase.cpp Threading.cpp diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 4fe8158180749..57ed97f7a7825 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -103,13 +103,13 @@ std::vector> buildHighlightScopeLookupTable() { return LookupTable; } -// Makes sure edits in \p E are applicable to latest file contents reported by +// Makes sure edits in \p FE are applicable to latest file contents reported by // editor. If not generates an error message containing information about files // that needs to be saved. 
-llvm::Error validateEdits(const DraftStore &DraftMgr, const Tweak::Effect &E) { +llvm::Error validateEdits(const DraftStore &DraftMgr, const FileEdits &FE) { size_t InvalidFileCount = 0; llvm::StringRef LastInvalidFile; - for (const auto &It : E.ApplyEdits) { + for (const auto &It : FE) { if (auto Draft = DraftMgr.getDraft(It.first())) { // If the file is open in user's editor, make sure the version we // saw and current version are compatible as this is the text that @@ -704,7 +704,7 @@ void ClangdLSPServer::onCommand(const ExecuteCommandParams &Params, if (R->ApplyEdits.empty()) return Reply("Tweak applied."); - if (auto Err = validateEdits(DraftMgr, *R)) + if (auto Err = validateEdits(DraftMgr, R->ApplyEdits)) return Reply(std::move(Err)); WorkspaceEdit WE; @@ -758,17 +758,23 @@ void ClangdLSPServer::onRename(const RenameParams &Params, if (!Code) return Reply(llvm::make_error( "onRename called for non-added file", ErrorCode::InvalidParams)); - - Server->rename(File, Params.position, Params.newName, /*WantFormat=*/true, - [File, Code, Params, Reply = std::move(Reply)]( - llvm::Expected> Edits) mutable { - if (!Edits) - return Reply(Edits.takeError()); - - WorkspaceEdit WE; - WE.changes = {{Params.textDocument.uri.uri(), *Edits}}; - Reply(WE); - }); + Server->rename( + File, Params.position, Params.newName, + /*WantFormat=*/true, + [File, Params, Reply = std::move(Reply), + this](llvm::Expected Edits) mutable { + if (!Edits) + return Reply(Edits.takeError()); + if (auto Err = validateEdits(DraftMgr, *Edits)) + return Reply(std::move(Err)); + WorkspaceEdit Result; + Result.changes.emplace(); + for (const auto &Rep : *Edits) { + (*Result.changes)[URI::createFile(Rep.first()).toString()] = + Rep.second.asTextEdits(); + } + Reply(Result); + }); } void ClangdLSPServer::onDocumentDidClose( diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index 5a9833d78b48e..e9e03dbc37426 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -119,7 +119,8 @@ ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB, : nullptr), GetClangTidyOptions(Opts.GetClangTidyOptions), SuggestMissingIncludes(Opts.SuggestMissingIncludes), - TweakFilter(Opts.TweakFilter), WorkspaceRoot(Opts.WorkspaceRoot), + CrossFileRename(Opts.CrossFileRename), TweakFilter(Opts.TweakFilter), + WorkspaceRoot(Opts.WorkspaceRoot), // Pass a callback into `WorkScheduler` to extract symbols from a newly // parsed file and rebuild the file index synchronously each time an AST // is parsed. @@ -308,54 +309,68 @@ void ClangdServer::prepareRename(PathRef File, Position Pos, if (!InpAST) return CB(InpAST.takeError()); auto &AST = InpAST->AST; - // Performing the rename isn't substantially more expensive than doing an - // AST-based check, so we just rename and throw away the results. We may - // have to revisit this when we support cross-file rename. - auto Changes = renameWithinFile(AST, File, Pos, "dummy", Index); + const auto &SM = AST.getSourceManager(); + SourceLocation Loc = + SM.getMacroArgExpandedLocation(getBeginningOfIdentifier( + Pos, AST.getSourceManager(), AST.getLangOpts())); + auto Range = getTokenRange(SM, AST.getLangOpts(), Loc); + if (!Range) + return CB(llvm::None); // "rename" is not valid at the position. + + if (CrossFileRename) + // FIXME: we now assume cross-file rename always succeeds, revisit this. 
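+    // Under LSP, prepareRename only has to report the range (and optional
+    // placeholder) to highlight; the rename itself is validated once the
+    // follow-up textDocument/rename request arrives.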
+ return CB(*Range); + + // Performing the local rename isn't substantially more expensive than + // doing an AST-based check, so we just rename and throw away the results. + auto Changes = clangd::rename({Pos, "dummy", AST, File, Index, + /*AllowCrossFile=*/false, + /*GetDirtyBuffer=*/nullptr}); if (!Changes) { // LSP says to return null on failure, but that will result in a generic // failure message. If we send an LSP error response, clients can surface // the message to users (VSCode does). return CB(Changes.takeError()); } - SourceLocation Loc = getBeginningOfIdentifier( - Pos, AST.getSourceManager(), AST.getASTContext().getLangOpts()); - if (auto Range = getTokenRange(AST.getSourceManager(), - AST.getASTContext().getLangOpts(), Loc)) - return CB(*Range); - // Return null if the "rename" is not valid at the position. - CB(llvm::None); + return CB(*Range); }; WorkScheduler.runWithAST("PrepareRename", File, std::move(Action)); } void ClangdServer::rename(PathRef File, Position Pos, llvm::StringRef NewName, - bool WantFormat, Callback> CB) { + bool WantFormat, Callback CB) { + // A snapshot of all file dirty buffers. + llvm::StringMap Snapshot = WorkScheduler.getAllFileContents(); auto Action = [File = File.str(), NewName = NewName.str(), Pos, WantFormat, - CB = std::move(CB), + CB = std::move(CB), Snapshot = std::move(Snapshot), this](llvm::Expected InpAST) mutable { if (!InpAST) return CB(InpAST.takeError()); - auto Changes = renameWithinFile(InpAST->AST, File, Pos, NewName, Index); - if (!Changes) - return CB(Changes.takeError()); + auto GetDirtyBuffer = + [&Snapshot](PathRef AbsPath) -> llvm::Optional { + auto It = Snapshot.find(AbsPath); + if (It == Snapshot.end()) + return llvm::None; + return It->second; + }; + auto Edits = clangd::rename({Pos, NewName, InpAST->AST, File, Index, + CrossFileRename, GetDirtyBuffer}); + if (!Edits) + return CB(Edits.takeError()); if (WantFormat) { auto Style = getFormatStyleForFile(File, InpAST->Inputs.Contents, InpAST->Inputs.FS.get()); - if (auto Formatted = - cleanupAndFormat(InpAST->Inputs.Contents, *Changes, Style)) - *Changes = std::move(*Formatted); - else - elog("Failed to format replacements: {0}", Formatted.takeError()); - } + llvm::Error Err = llvm::Error::success(); + for (auto &E : *Edits) + Err = + llvm::joinErrors(reformatEdit(E.getValue(), Style), std::move(Err)); - std::vector Edits; - for (const auto &Rep : *Changes) - Edits.push_back(replacementToEdit(InpAST->Inputs.Contents, Rep)); - return CB(std::move(Edits)); + if (Err) + return CB(std::move(Err)); + } + return CB(std::move(*Edits)); }; - WorkScheduler.runWithAST("Rename", File, std::move(Action)); } diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index cd0b91c08f084..499340808765b 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -24,6 +24,7 @@ #include "index/Background.h" #include "index/FileIndex.h" #include "index/Index.h" +#include "refactor/Rename.h" #include "refactor/Tweak.h" #include "clang/Tooling/CompilationDatabase.h" #include "clang/Tooling/Core/Replacement.h" @@ -133,6 +134,9 @@ class ClangdServer { /// Enable semantic highlighting features. bool SemanticHighlighting = false; + /// Enable cross-file rename feature. + bool CrossFileRename = false; + /// Returns true if the tweak should be enabled. std::function TweakFilter = [](const Tweak &T) { return !T.hidden(); // only enable non-hidden tweaks. 
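ClangdServer::rename above snapshots every dirty buffer before scheduling the action, then hands the action a lookup callback, so the asynchronous rename never reads live editor state. A minimal sketch of that snapshot-callback pattern (DirtyBufferGetter and makeSnapshotGetter are illustrative names, not clangd's API):

```cpp
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include <functional>
#include <string>
#include <utility>

using DirtyBufferGetter =
    std::function<llvm::Optional<std::string>(llvm::StringRef Path)>;

// Copy the open-file contents once, up front; the returned callback keeps
// answering from that immutable snapshot even if the user edits while the
// rename action sits in the queue.
DirtyBufferGetter makeSnapshotGetter(llvm::StringMap<std::string> Snapshot) {
  return [Snapshot = std::move(Snapshot)](
             llvm::StringRef Path) -> llvm::Optional<std::string> {
    auto It = Snapshot.find(Path);
    if (It == Snapshot.end())
      return llvm::None;
    return It->second;
  };
}
```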
@@ -252,7 +256,7 @@ class ClangdServer { /// embedders could use this method to get all occurrences of the symbol (e.g. /// highlighting them in prepare stage). void rename(PathRef File, Position Pos, llvm::StringRef NewName, - bool WantFormat, Callback> CB); + bool WantFormat, Callback CB); struct TweakRef { std::string ID; /// ID to pass for applyTweak. @@ -327,6 +331,8 @@ class ClangdServer { // can be caused by missing includes (e.g. member access in incomplete type). bool SuggestMissingIncludes = false; + bool CrossFileRename = false; + std::function TweakFilter; // GUARDED_BY(CachedCompletionFuzzyFindRequestMutex) diff --git a/clang-tools-extra/clangd/Compiler.cpp b/clang-tools-extra/clangd/Compiler.cpp index 795fd0082594d..eae753b5c9b36 100644 --- a/clang-tools-extra/clangd/Compiler.cpp +++ b/clang-tools-extra/clangd/Compiler.cpp @@ -42,7 +42,8 @@ void IgnoreDiagnostics::HandleDiagnostic(DiagnosticsEngine::Level DiagLevel, std::unique_ptr buildCompilerInvocation(const ParseInputs &Inputs, - clang::DiagnosticConsumer &D) { + clang::DiagnosticConsumer &D, + std::vector *CC1Args) { std::vector ArgStrs; for (const auto &S : Inputs.CompileCommand.CommandLine) ArgStrs.push_back(S.c_str()); @@ -57,7 +58,7 @@ buildCompilerInvocation(const ParseInputs &Inputs, CompilerInstance::createDiagnostics(new DiagnosticOptions, &D, false); std::unique_ptr CI = createInvocationFromCommandLine( ArgStrs, CommandLineDiagsEngine, Inputs.FS, - /*ShouldRecoverOnErrors=*/true); + /*ShouldRecoverOnErrors=*/true, CC1Args); if (!CI) return nullptr; // createInvocationFromCommandLine sets DisableFree. diff --git a/clang-tools-extra/clangd/Compiler.h b/clang-tools-extra/clangd/Compiler.h index 6ab1b0f075f93..51414c37fc042 100644 --- a/clang-tools-extra/clangd/Compiler.h +++ b/clang-tools-extra/clangd/Compiler.h @@ -52,8 +52,8 @@ struct ParseInputs { /// Builds compiler invocation that could be used to build AST or preamble. std::unique_ptr -buildCompilerInvocation(const ParseInputs &Inputs, - clang::DiagnosticConsumer &D); +buildCompilerInvocation(const ParseInputs &Inputs, clang::DiagnosticConsumer &D, + std::vector *CC1Args = nullptr); /// Creates a compiler instance, configured so that: /// - Contents of the parsed file are remapped to \p MainFile. diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index c536cbf75e5c0..3e55a6a9cdc68 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -175,6 +175,9 @@ struct TargetFinder { RelSet Flags; Visitor(TargetFinder &Outer, RelSet Flags) : Outer(Outer), Flags(Flags) {} + void VisitCallExpr(const CallExpr *CE) { + Outer.add(CE->getCalleeDecl(), Flags); + } void VisitDeclRefExpr(const DeclRefExpr *DRE) { const Decl *D = DRE->getDecl(); // UsingShadowDecl allows us to record the UsingDecl. diff --git a/clang-tools-extra/clangd/HeaderSourceSwitch.cpp b/clang-tools-extra/clangd/HeaderSourceSwitch.cpp index 698f2460fea57..f90e46a24f329 100644 --- a/clang-tools-extra/clangd/HeaderSourceSwitch.cpp +++ b/clang-tools-extra/clangd/HeaderSourceSwitch.cpp @@ -97,7 +97,7 @@ llvm::Optional getCorrespondingHeaderOrSource(const Path &OriginalFile, // // For each symbol in the original file, we get its target location (decl or // def) from the index, then award that target file. 
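 // The candidate file that accumulates the most awards becomes the result.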
- bool IsHeader = isHeaderFile(OriginalFile, AST.getASTContext().getLangOpts()); + bool IsHeader = isHeaderFile(OriginalFile, AST.getLangOpts()); Index->lookup(Request, [&](const Symbol &Sym) { if (IsHeader) AwardTarget(Sym.Definition.FileURI); diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index c14ff1b3fe631..9053bc08b4ec3 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -367,8 +367,7 @@ HoverInfo getHoverContents(const DefinedMacro &Macro, ParsedAST &AST) { SourceLocation StartLoc = Macro.Info->getDefinitionLoc(); SourceLocation EndLoc = Macro.Info->getDefinitionEndLoc(); if (EndLoc.isValid()) { - EndLoc = Lexer::getLocForEndOfToken(EndLoc, 0, SM, - AST.getASTContext().getLangOpts()); + EndLoc = Lexer::getLocForEndOfToken(EndLoc, 0, SM, AST.getLangOpts()); bool Invalid; StringRef Buffer = SM.getBufferData(SM.getFileID(StartLoc), &Invalid); if (!Invalid) { @@ -391,7 +390,7 @@ llvm::Optional getHover(ParsedAST &AST, Position Pos, const SourceManager &SM = AST.getSourceManager(); llvm::Optional HI; SourceLocation SourceLocationBeg = SM.getMacroArgExpandedLocation( - getBeginningOfIdentifier(Pos, SM, AST.getASTContext().getLangOpts())); + getBeginningOfIdentifier(Pos, SM, AST.getLangOpts())); if (auto Deduced = getDeducedType(AST.getASTContext(), SourceLocationBeg)) { // Find the corresponding decl to populate kind and fetch documentation. @@ -435,9 +434,8 @@ llvm::Optional getHover(ParsedAST &AST, Position Pos, tooling::applyAllReplacements(HI->Definition, Replacements)) HI->Definition = *Formatted; - HI->SymRange = - getTokenRange(AST.getASTContext().getSourceManager(), - AST.getASTContext().getLangOpts(), SourceLocationBeg); + HI->SymRange = getTokenRange(AST.getASTContext().getSourceManager(), + AST.getLangOpts(), SourceLocationBeg); return HI; } diff --git a/clang-tools-extra/clangd/JSONTransport.cpp b/clang-tools-extra/clangd/JSONTransport.cpp index 4921035b6dbb3..6351b8056b3fa 100644 --- a/clang-tools-extra/clangd/JSONTransport.cpp +++ b/clang-tools-extra/clangd/JSONTransport.cpp @@ -7,8 +7,10 @@ //===----------------------------------------------------------------------===// #include "Logger.h" #include "Protocol.h" // For LSPError +#include "Shutdown.h" #include "Transport.h" #include "llvm/Support/Errno.h" +#include "llvm/Support/Error.h" namespace clang { namespace clangd { @@ -81,6 +83,10 @@ class JSONTransport : public Transport { llvm::Error loop(MessageHandler &Handler) override { while (!feof(In)) { + if (shutdownRequested()) + return llvm::createStringError( + std::make_error_code(std::errc::operation_canceled), + "Got signal, shutting down"); if (ferror(In)) return llvm::errorCodeToError( std::error_code(errno, std::system_category())); @@ -167,7 +173,7 @@ bool JSONTransport::handleMessage(llvm::json::Value Message, } // Tries to read a line up to and including \n. -// If failing, feof() or ferror() will be set. +// If failing, feof(), ferror(), or shutdownRequested() will be set. bool readLine(std::FILE *In, std::string &Out) { static constexpr int BufSize = 1024; size_t Size = 0; @@ -175,7 +181,8 @@ bool readLine(std::FILE *In, std::string &Out) { for (;;) { Out.resize(Size + BufSize); // Handle EINTR which is sent when a debugger attaches on some platforms. 
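 // fgets() then fails with errno == EINTR and is simply retried, unless a
 // shutdown signal arrived in the meantime, in which case the read gives up
 // so loop() can exit.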
- if (!llvm::sys::RetryAfterSignal(nullptr, ::fgets, &Out[Size], BufSize, In)) + if (!retryAfterSignalUnlessShutdown( + nullptr, [&] { return std::fgets(&Out[Size], BufSize, In); })) return false; clearerr(In); // If the line contained null bytes, anything after it (including \n) will @@ -190,7 +197,7 @@ bool readLine(std::FILE *In, std::string &Out) { } // Returns None when: -// - ferror() or feof() are set. +// - ferror(), feof(), or shutdownRequested() are set. // - Content-Length is missing or empty (protocol error) llvm::Optional JSONTransport::readStandardMessage() { // A Language Server Protocol message starts with a set of HTTP headers, @@ -244,8 +251,9 @@ llvm::Optional JSONTransport::readStandardMessage() { std::string JSON(ContentLength, '\0'); for (size_t Pos = 0, Read; Pos < ContentLength; Pos += Read) { // Handle EINTR which is sent when a debugger attaches on some platforms. - Read = llvm::sys::RetryAfterSignal(0u, ::fread, &JSON[Pos], 1, - ContentLength - Pos, In); + Read = retryAfterSignalUnlessShutdown(0, [&]{ + return std::fread(&JSON[Pos], 1, ContentLength - Pos, In); + }); if (Read == 0) { elog("Input was aborted. Read only {0} bytes of expected {1}.", Pos, ContentLength); @@ -263,7 +271,7 @@ llvm::Optional JSONTransport::readStandardMessage() { // - messages are delimited by '---' on a line by itself // - lines starting with # are ignored. // This is a testing path, so favor simplicity over performance here. -// When returning None, feof() or ferror() will be set. +// When returning None, feof(), ferror(), or shutdownRequested() will be set. llvm::Optional JSONTransport::readDelimitedMessage() { std::string JSON; std::string Line; @@ -280,6 +288,8 @@ llvm::Optional JSONTransport::readDelimitedMessage() { JSON += Line; } + if (shutdownRequested()) + return llvm::None; if (ferror(In)) { elog("Input error while reading message!"); return llvm::None; diff --git a/clang-tools-extra/clangd/ParsedAST.h b/clang-tools-extra/clangd/ParsedAST.h index 0b4a6ab73df83..f2afc264e23a1 100644 --- a/clang-tools-extra/clangd/ParsedAST.h +++ b/clang-tools-extra/clangd/ParsedAST.h @@ -77,6 +77,10 @@ class ParsedAST { return getASTContext().getSourceManager(); } + const LangOptions &getLangOpts() const { + return getASTContext().getLangOpts(); + } + /// This function returns top-level decls present in the main file of the AST. /// The result does not include the decls that come from the preamble. /// (These should be const, but RecursiveASTVisitor requires Decl*). diff --git a/clang-tools-extra/clangd/Selection.cpp b/clang-tools-extra/clangd/Selection.cpp index c91cd24e2f25f..ffa48f3a57d96 100644 --- a/clang-tools-extra/clangd/Selection.cpp +++ b/clang-tools-extra/clangd/Selection.cpp @@ -34,95 +34,289 @@ namespace { using Node = SelectionTree::Node; using ast_type_traits::DynTypedNode; -// Identifies which tokens are selected, and evaluates claims of source ranges -// by AST nodes. Tokens may be claimed only once: first-come, first-served. -class SelectedTokens { +// An IntervalSet maintains a set of disjoint subranges of an array. +// +// Initially, it contains the entire array. +// [-----------------------------------------------------------] +// +// When a range is erased(), it will typically split the array in two. +// Claim: [--------------------] +// after: [----------------] [-------------------] +// +// erase() returns the segments actually erased. 
Given the state above: +// Claim: [---------------------------------------] +// Out: [---------] [------] +// After: [-----] [-----------] +// +// It is used to track (expanded) tokens not yet associated with an AST node. +// On traversing an AST node, its token range is erased from the unclaimed set. +// The tokens actually removed are associated with that node, and hit-tested +// against the selection to determine whether the node is selected. +template +class IntervalSet { +public: + IntervalSet(llvm::ArrayRef Range) { UnclaimedRanges.insert(Range); } + + // Removes the elements of Claim from the set, modifying or removing ranges + // that overlap it. + // Returns the continuous subranges of Claim that were actually removed. + llvm::SmallVector, 4> erase(llvm::ArrayRef Claim) { + llvm::SmallVector, 4> Out; + if (Claim.empty()) + return Out; + + // General case: + // Claim: [-----------------] + // UnclaimedRanges: [-A-] [-B-] [-C-] [-D-] [-E-] [-F-] [-G-] + // Overlap: ^first ^second + // Ranges C and D are fully included. Ranges B and E must be trimmed. + auto Overlap = std::make_pair( + UnclaimedRanges.lower_bound({Claim.begin(), Claim.begin()}), // C + UnclaimedRanges.lower_bound({Claim.end(), Claim.end()})); // F + // Rewind to cover B. + if (Overlap.first != UnclaimedRanges.begin()) { + --Overlap.first; + // ...unless B isn't selected at all. + if (Overlap.first->end() <= Claim.begin()) + ++Overlap.first; + } + if (Overlap.first == Overlap.second) + return Out; + + // First, copy all overlapping ranges into the output. + auto OutFirst = Out.insert(Out.end(), Overlap.first, Overlap.second); + // If any of the overlapping ranges were sliced by the claim, split them: + // - restrict the returned range to the claimed part + // - save the unclaimed part so it can be reinserted + llvm::ArrayRef RemainingHead, RemainingTail; + if (Claim.begin() > OutFirst->begin()) { + RemainingHead = {OutFirst->begin(), Claim.begin()}; + *OutFirst = {Claim.begin(), OutFirst->end()}; + } + if (Claim.end() < Out.back().end()) { + RemainingTail = {Claim.end(), Out.back().end()}; + Out.back() = {Out.back().begin(), Claim.end()}; + } + + // Erase all the overlapping ranges (invalidating all iterators). + UnclaimedRanges.erase(Overlap.first, Overlap.second); + // Reinsert ranges that were merely trimmed. + if (!RemainingHead.empty()) + UnclaimedRanges.insert(RemainingHead); + if (!RemainingTail.empty()) + UnclaimedRanges.insert(RemainingTail); + + return Out; + } + +private: + using TokenRange = llvm::ArrayRef; + struct RangeLess { + bool operator()(llvm::ArrayRef L, llvm::ArrayRef R) const { + return L.begin() < R.begin(); + } + }; + + // Disjoint sorted unclaimed ranges of expanded tokens. + std::set, RangeLess> + UnclaimedRanges; +}; + +// Sentinel value for the selectedness of a node where we've seen no tokens yet. +// This resolves to Unselected if no tokens are ever seen. +// But Unselected + Complete -> Partial, while NoTokens + Complete --> Complete. +// This value is never exposed publicly. +constexpr SelectionTree::Selection NoTokens = + static_cast( + static_cast(SelectionTree::Complete + 1)); + +// Nodes start with NoTokens, and then use this function to aggregate the +// selectedness as more tokens are found. +void update(SelectionTree::Selection &Result, SelectionTree::Selection New) { + if (New == NoTokens) + return; + if (Result == NoTokens) + Result = New; + else if (Result != New) + // Can only be completely selected (or unselected) if all tokens are. 
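+  // e.g. a Complete token followed by an Unselected one within the same
+  // node makes that node Partial.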
+ Result = SelectionTree::Partial; +} + + +// SelectionTester can determine whether a range of tokens from the PP-expanded +// stream (corresponding to an AST node) is considered selected. +// +// When the tokens result from macro expansions, the appropriate tokens in the +// main file are examined (macro invocation or args). Similarly for #includes. +// +// It tests each token in the range (not just the endpoints) as contiguous +// expanded tokens may not have contiguous spellings (with macros). +// +// Non-token text, and tokens not modeled in the AST (comments, semicolons) +// are ignored when determining selectedness. +class SelectionTester { public: - SelectedTokens(llvm::ArrayRef Spelled, const SourceManager &SM, - unsigned SelBegin, unsigned SelEnd) - : SelBegin(SelBegin), SelEnd(SelEnd) { - // Extract bounds and selected-ness for all tokens spelled in the file. - Tokens.reserve(Spelled.size()); - for (const auto& Tok : Spelled) { + // The selection is offsets [SelBegin, SelEnd) in SelFile. + SelectionTester(const syntax::TokenBuffer &Buf, FileID SelFile, + unsigned SelBegin, unsigned SelEnd, const SourceManager &SM) + : SelFile(SelFile), SM(SM) { + // Find all tokens (partially) selected in the file. + auto AllSpelledTokens = Buf.spelledTokens(SelFile); + const syntax::Token *SelFirst = + llvm::partition_point(AllSpelledTokens, [&](const syntax::Token &Tok) { + return SM.getFileOffset(Tok.endLocation()) <= SelBegin; + }); + const syntax::Token *SelLimit = std::partition_point( + SelFirst, AllSpelledTokens.end(), [&](const syntax::Token &Tok) { + return SM.getFileOffset(Tok.location()) < SelEnd; + }); + // Precompute selectedness and offset for selected spelled tokens. + for (const syntax::Token *T = SelFirst; T < SelLimit; ++T) { // As well as comments, don't count semicolons as real tokens. // They're not properly claimed as expr-statement is missing from the AST. - if (Tok.kind() == tok::comment || Tok.kind() == tok::semi) + if (T->kind() == tok::comment || T->kind() == tok::semi) continue; - - Tokens.emplace_back(); - TokInfo &S = Tokens.back(); - S.StartOffset = SM.getFileOffset(Tok.location()); - S.EndOffset = S.StartOffset + Tok.length(); - if (S.StartOffset >= SelBegin && S.EndOffset <= SelEnd) + SpelledTokens.emplace_back(); + Tok &S = SpelledTokens.back(); + S.Offset = SM.getFileOffset(T->location()); + if (S.Offset >= SelBegin && S.Offset + T->length() <= SelEnd) S.Selected = SelectionTree::Complete; - else if (S.EndOffset > SelBegin && S.StartOffset < SelEnd) - S.Selected = SelectionTree::Partial; else - S.Selected = SelectionTree::Unselected; - S.Claimed = false; + S.Selected = SelectionTree::Partial; } } - // Associates any tokens overlapping [Begin, End) with an AST node. - // Tokens that were already claimed by another AST node are not claimed again. - // Updates Result if the node is selected in the sense of SelectionTree. - void claim(unsigned Begin, unsigned End, SelectionTree::Selection &Result) { - assert(Begin <= End); + // Test whether a consecutive range of tokens is selected. + // The tokens are taken from the expanded token stream. + SelectionTree::Selection + test(llvm::ArrayRef ExpandedTokens) const { + if (SpelledTokens.empty()) + return NoTokens; + SelectionTree::Selection Result = NoTokens; + while (!ExpandedTokens.empty()) { + // Take consecutive tokens from the same context together for efficiency. 
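+      // e.g. for an expanded stream a b | c d | e (bars mark FileID changes),
+      // we hit-test the batches [a b], [c d], [e] rather than single tokens.
+      // (Illustrative only; the real batch boundaries come from getFileID
+      // below.)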
+ FileID FID = SM.getFileID(ExpandedTokens.front().location()); + auto Batch = ExpandedTokens.take_while([&](const syntax::Token &T) { + return SM.getFileID(T.location()) == FID; + }); + assert(!Batch.empty()); + ExpandedTokens = ExpandedTokens.drop_front(Batch.size()); + + update(Result, testChunk(FID, Batch)); + } + return Result; + } - // Fast-path for missing the selection entirely. - if (Begin >= SelEnd || End <= SelBegin) - return; - - // We will consider the range (at least partially) selected if it hit any - // selected and previously unclaimed token. - bool ClaimedAnyToken = false; - // The selection is (at most) partial if: - // - any claimed token is partially selected - // - any token in the range is unselected - bool PartialSelection = false; - - // Find the first token that (maybe) overlaps the claimed range. - auto Start = llvm::partition_point(Tokens, [&](const TokInfo &Tok) { - return Tok.EndOffset <= Begin; - }); - // Iterate over every token that overlaps the range. - // Claim selected tokens, and update the two result flags. - for (auto It = Start; It != Tokens.end() && It->StartOffset < End; ++It) { - if (It->Selected) { - if (!It->Claimed) { - // Token is selected, in the node's range, and unclaimed; claim it. - It->Claimed = true; - ClaimedAnyToken = true; - // If the token was only partially selected, so is the node. - PartialSelection |= (It->Selected == SelectionTree::Partial); - } - } else { - // If the node covers an unselected token, it's not completely selected. - PartialSelection = true; + // Cheap check whether any of the tokens in R might be selected. + // If it returns false, test() will return NoTokens or Unselected. + // If it returns true, test() may return any value. + bool mayHit(SourceRange R) const { + if (SpelledTokens.empty()) + return false; + auto B = SM.getDecomposedLoc(R.getBegin()); + auto E = SM.getDecomposedLoc(R.getEnd()); + if (B.first == SelFile && E.first == SelFile) + if (E.second < SpelledTokens.front().Offset || + B.second > SpelledTokens.back().Offset) + return false; + return true; + } + +private: + // Hit-test a consecutive range of tokens from a single file ID. + SelectionTree::Selection + testChunk(FileID FID, llvm::ArrayRef Batch) const { + assert(!Batch.empty()); + SourceLocation StartLoc = Batch.front().location(); + // There are several possible categories of FileID depending on how the + // preprocessor was used to generate these tokens: + // main file, #included file, macro args, macro bodies. + // We need to identify the main-file tokens that represent Batch, and + // determine whether we want to exclusively claim them. Regular tokens + // represent one AST construct, but a macro invocation can represent many. + + // Handle tokens written directly in the main file. + if (FID == SelFile) { + return testTokenRange(SM.getFileOffset(Batch.front().location()), + SM.getFileOffset(Batch.back().location())); + } + + // Handle tokens in another file #included into the main file. + // Check if the #include is selected, but don't claim it exclusively. + if (StartLoc.isFileID()) { + for (SourceLocation Loc = Batch.front().location(); Loc.isValid(); + Loc = SM.getIncludeLoc(SM.getFileID(Loc))) { + if (SM.getFileID(Loc) == SelFile) + // FIXME: use whole #include directive, not just the filename string. + return testToken(SM.getFileOffset(Loc)); } + return NoTokens; } - // If some tokens were previously claimed (Result != Unselected), we may - // upgrade from Partial->Complete, even if no new tokens were claimed. 
- // Important for [[int a]]. - if (ClaimedAnyToken || Result) { - Result = std::max(Result, PartialSelection ? SelectionTree::Partial - : SelectionTree::Complete); + assert(StartLoc.isMacroID()); + // Handle tokens that were passed as a macro argument. + SourceLocation ArgStart = SM.getTopMacroCallerLoc(StartLoc); + if (SM.getFileID(ArgStart) == SelFile) { + SourceLocation ArgEnd = SM.getTopMacroCallerLoc(Batch.back().location()); + return testTokenRange(SM.getFileOffset(ArgStart), + SM.getFileOffset(ArgEnd)); } + + // Handle tokens produced by non-argument macro expansion. + // Check if the macro name is selected, don't claim it exclusively. + auto Expansion = SM.getDecomposedExpansionLoc(StartLoc); + if (Expansion.first == SelFile) + // FIXME: also check ( and ) for function-like macros? + return testToken(Expansion.second); + else + return NoTokens; } -private: - struct TokInfo { - unsigned StartOffset; - unsigned EndOffset; + // Is the closed token range [Begin, End] selected? + SelectionTree::Selection testTokenRange(unsigned Begin, unsigned End) const { + assert(Begin <= End); + // Outside the selection entirely? + if (End < SpelledTokens.front().Offset || + Begin > SpelledTokens.back().Offset) + return SelectionTree::Unselected; + + // Compute range of tokens. + auto B = llvm::partition_point( + SpelledTokens, [&](const Tok &T) { return T.Offset < Begin; }); + auto E = std::partition_point( + B, SpelledTokens.end(), [&](const Tok &T) { return T.Offset <= End; }); + + // Aggregate selectedness of tokens in range. + bool ExtendsOutsideSelection = Begin < SpelledTokens.front().Offset || + End > SpelledTokens.back().Offset; + SelectionTree::Selection Result = + ExtendsOutsideSelection ? SelectionTree::Unselected : NoTokens; + for (auto It = B; It != E; ++It) + update(Result, It->Selected); + return Result; + } + + // Is the token at `Offset` selected? + SelectionTree::Selection testToken(unsigned Offset) const { + // Outside the selection entirely? + if (Offset < SpelledTokens.front().Offset || + Offset > SpelledTokens.back().Offset) + return SelectionTree::Unselected; + // Find the token, if it exists. + auto It = llvm::partition_point( + SpelledTokens, [&](const Tok &T) { return T.Offset < Offset; }); + if (It != SpelledTokens.end() && It->Offset == Offset) + return It->Selected; + return NoTokens; + } + + struct Tok { + unsigned Offset; SelectionTree::Selection Selected; - bool Claimed; - bool operator<(const TokInfo &Other) const { - return StartOffset < Other.StartOffset; - } }; - std::vector Tokens; - unsigned SelBegin, SelEnd; + std::vector SpelledTokens; + FileID SelFile; + const SourceManager &SM; }; // Show the type of a node for debugging. @@ -195,16 +389,6 @@ class SelectionVisitor : public RecursiveASTVisitor { V.TraverseAST(AST); assert(V.Stack.size() == 1 && "Unpaired push/pop?"); assert(V.Stack.top() == &V.Nodes.front()); - // We selected TUDecl if tokens were unclaimed (or the file is empty). - SelectionTree::Selection UnclaimedTokens = SelectionTree::Unselected; - V.Claimed.claim(Begin, End, UnclaimedTokens); - if (UnclaimedTokens || V.Nodes.size() == 1) { - StringRef FileContent = AST.getSourceManager().getBufferData(File); - // Don't require the trailing newlines to be selected. - bool SelectedAll = Begin == 0 && End >= FileContent.rtrim().size(); - V.Stack.top()->Selected = - SelectedAll ? 
SelectionTree::Complete : SelectionTree::Partial; - } return std::move(V.Nodes); } @@ -289,11 +473,8 @@ class SelectionVisitor : public RecursiveASTVisitor { #ifndef NDEBUG PrintPolicy(PP), #endif - Claimed(Tokens.spelledTokens(SelFile), SM, SelBegin, SelEnd), - SelFile(SelFile), - SelBeginTokenStart(SM.getFileOffset(Lexer::GetBeginningOfToken( - SM.getComposedLoc(SelFile, SelBegin), SM, LangOpts))), - SelEnd(SelEnd) { + TokenBuf(Tokens), SelChecker(Tokens, SelFile, SelBegin, SelEnd, SM), + UnclaimedExpandedTokens(Tokens.expandedTokens()) { // Ensure we have a node for the TU decl, regardless of traversal scope. Nodes.emplace_back(); Nodes.back().ASTNode = DynTypedNode::create(*AST.getTranslationUnitDecl()); @@ -346,18 +527,12 @@ class SelectionVisitor : public RecursiveASTVisitor { // don't intersect the selection may be recursively skipped. bool canSafelySkipNode(const DynTypedNode &N) { SourceRange S = N.getSourceRange(); - auto B = SM.getDecomposedLoc(S.getBegin()); - auto E = SM.getDecomposedLoc(S.getEnd()); - // Node lies in a macro expansion? - if (B.first != SelFile || E.first != SelFile) - return false; - // Node intersects selection tokens? - if (B.second < SelEnd && E.second >= SelBeginTokenStart) - return false; - // Otherwise, allow skipping over the node. - dlog("{1}skip: {0}", printNodeToString(N, PrintPolicy), indent()); - dlog("{1}skipped range = {0}", S.printToString(SM), indent(1)); - return true; + if (!SelChecker.mayHit(S)) { + dlog("{1}skip: {0}", printNodeToString(N, PrintPolicy), indent()); + dlog("{1}skipped range = {0}", S.printToString(SM), indent(1)); + return true; + } + return false; } // There are certain nodes we want to treat as leaves in the SelectionTree, @@ -377,11 +552,9 @@ class SelectionVisitor : public RecursiveASTVisitor { Nodes.emplace_back(); Nodes.back().ASTNode = std::move(Node); Nodes.back().Parent = Stack.top(); + Nodes.back().Selected = NoTokens; Stack.push(&Nodes.back()); claimRange(Early, Nodes.back().Selected); - // Early hit detection never selects the whole node. - if (Nodes.back().Selected) - Nodes.back().Selected = SelectionTree::Partial; } // Pops a node off the ancestor stack, and finalizes it. Pairs with push(). @@ -390,6 +563,8 @@ class SelectionVisitor : public RecursiveASTVisitor { Node &N = *Stack.top(); dlog("{1}pop: {0}", printNodeToString(N.ASTNode, PrintPolicy), indent(-1)); claimRange(N.ASTNode.getSourceRange(), N.Selected); + if (N.Selected == NoTokens) + N.Selected = SelectionTree::Unselected; if (N.Selected || !N.Children.empty()) { // Attach to the tree. N.Parent->Children.push_back(&N); @@ -424,31 +599,12 @@ class SelectionVisitor : public RecursiveASTVisitor { // This is usually called from pop(), so we can take children into account. // The existing state of Result is relevant (early/late claims can interact). void claimRange(SourceRange S, SelectionTree::Selection &Result) { - if (!S.isValid()) - return; - // toHalfOpenFileRange() allows selection of constructs in macro args. e.g: - // #define LOOP_FOREVER(Body) for(;;) { Body } - // void IncrementLots(int &x) { - // LOOP_FOREVER( ++x; ) - // } - // Selecting "++x" or "x" will do the right thing. - auto Range = toHalfOpenFileRange(SM, LangOpts, S); - assert(Range && "We should be able to get the File Range"); - dlog("{1}claimRange: {0}", Range->printToString(SM), indent()); - auto B = SM.getDecomposedLoc(Range->getBegin()); - auto E = SM.getDecomposedLoc(Range->getEnd()); - // Otherwise, nodes in macro expansions can't be selected. 
- if (B.first != SelFile || E.first != SelFile) - return; - // Attempt to claim the remaining range. If there's nothing to claim, only - // children were selected. - Claimed.claim(B.second, E.second, Result); - if (Result) - dlog("{1}hit selection: {0}", - SourceRange(SM.getComposedLoc(B.first, B.second), - SM.getComposedLoc(E.first, E.second)) - .printToString(SM), - indent()); + for (const auto &ClaimedRange : + UnclaimedExpandedTokens.erase(TokenBuf.expandedTokens(S))) + update(Result, SelChecker.test(ClaimedRange)); + + if (Result && Result != NoTokens) + dlog("{1}hit selection: {0}", S.printToString(SM), indent()); } std::string indent(int Offset = 0) { @@ -463,17 +619,11 @@ class SelectionVisitor : public RecursiveASTVisitor { #ifndef NDEBUG const PrintingPolicy &PrintPolicy; #endif + const syntax::TokenBuffer &TokenBuf; std::stack Stack; - SelectedTokens Claimed; + SelectionTester SelChecker; + IntervalSet UnclaimedExpandedTokens; std::deque Nodes; // Stable pointers as we add more nodes. - FileID SelFile; - // If the selection start slices a token in half, the beginning of that token. - // This is useful for checking whether the end of a token range overlaps - // the selection: range.end < SelBeginTokenStart is equivalent to - // range.end + measureToken(range.end) < SelBegin (assuming range.end points - // to a token), and it saves a lex every time. - unsigned SelBeginTokenStart; - unsigned SelEnd; }; } // namespace @@ -513,8 +663,9 @@ static std::pair pointBounds(unsigned Offset, FileID FID, return {Offset - 1, Offset}; // We could choose either this byte or the previous. Usually we prefer the // character on the right of the cursor (or under a block cursor). - // But if that's whitespace, we likely want the token on the left. - if (isWhitespace(Buf[Offset]) && !isWhitespace(Buf[Offset - 1])) + // But if that's whitespace/semicolon, we likely want the token on the left. + auto IsIgnoredChar = [](char C) { return isWhitespace(C) || C == ';'; }; + if (IsIgnoredChar(Buf[Offset]) && !IsIgnoredChar(Buf[Offset - 1])) return {Offset - 1, Offset}; return {Offset, Offset + 1}; } diff --git a/clang-tools-extra/clangd/Selection.h b/clang-tools-extra/clangd/Selection.h index 9bcb9d5fb01f0..a7050c49be6ba 100644 --- a/clang-tools-extra/clangd/Selection.h +++ b/clang-tools-extra/clangd/Selection.h @@ -76,7 +76,7 @@ class SelectionTree { unsigned Start, unsigned End); // Describes to what extent an AST node is covered by the selection. - enum Selection { + enum Selection : unsigned char { // The AST node owns no characters covered by the selection. 
// Note that characters owned by children don't count: // if (x == 0) scream(); diff --git a/clang-tools-extra/clangd/SemanticSelection.cpp b/clang-tools-extra/clangd/SemanticSelection.cpp index 91a5582ac29a4..cbbf31f1b05b5 100644 --- a/clang-tools-extra/clangd/SemanticSelection.cpp +++ b/clang-tools-extra/clangd/SemanticSelection.cpp @@ -30,7 +30,7 @@ llvm::Expected> getSemanticRanges(ParsedAST &AST, Position Pos) { std::vector Result; const auto &SM = AST.getSourceManager(); - const auto &LangOpts = AST.getASTContext().getLangOpts(); + const auto &LangOpts = AST.getLangOpts(); auto FID = SM.getMainFileID(); auto Offset = positionToOffset(SM.getBufferData(FID), Pos); diff --git a/clang-tools-extra/clangd/Shutdown.cpp b/clang-tools-extra/clangd/Shutdown.cpp new file mode 100644 index 0000000000000..dfea46d8dfeb8 --- /dev/null +++ b/clang-tools-extra/clangd/Shutdown.cpp @@ -0,0 +1,39 @@ +//===--- Shutdown.cpp - Unclean exit scenarios ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Shutdown.h" + +#include +#include + +namespace clang { +namespace clangd { + +void abortAfterTimeout(std::chrono::seconds Timeout) { + // This is more portable than sys::WatchDog, and yields a stack trace. + std::thread([Timeout] { + std::this_thread::sleep_for(Timeout); + std::abort(); + }).detach(); +} + +static std::atomic ShutdownRequested = {false}; + +void requestShutdown() { + if (ShutdownRequested.exchange(true)) + // This is the second shutdown request. Exit hard. + std::abort(); +} + +bool shutdownRequested() { + return ShutdownRequested; +} + +} // namespace clangd +} // namespace clang + diff --git a/clang-tools-extra/clangd/Shutdown.h b/clang-tools-extra/clangd/Shutdown.h new file mode 100644 index 0000000000000..3097f6a3e63c7 --- /dev/null +++ b/clang-tools-extra/clangd/Shutdown.h @@ -0,0 +1,84 @@ +//===--- Shutdown.h - Unclean exit scenarios --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// LSP specifies a protocol for shutting down: a `shutdown` request followed +// by an `exit` notification. If this protocol is followed, clangd should +// finish outstanding work and exit with code 0. +// +// The way this works in the happy case: +// - when ClangdLSPServer gets `shutdown`, it sets a flag +// - when ClangdLSPServer gets `exit`, it returns false to indicate end-of-LSP +// - Transport::loop() returns with no error +// - ClangdServer::run() checks the shutdown flag and returns with no error. +// - we `return 0` from main() +// - destructor of ClangdServer and other main()-locals runs. +// This blocks until outstanding requests complete (results are ignored) +// - global destructors run, such as fallback deletion of temporary files +// +// There are a number of things that can go wrong. Some are handled here, and +// some elsewhere. +// - `exit` notification with no `shutdown`: +// ClangdServer::run() sees this and returns false, main() returns nonzero. 
+// - stdin/stdout are closed
+//   The Transport detects this while doing IO and returns an error from loop()
+//   ClangdServer::run() logs a message and then returns false, etc
+// - a request thread gets stuck, so the ClangdServer destructor hangs.
+//   Before returning from main(), we start a watchdog thread to abort() the
+//   process if it takes too long to exit. See abortAfterTimeout().
+// - clangd crashes (e.g. segfault or assertion)
+//   A fatal signal is sent (SEGV, ABRT, etc)
+//   The installed signal handler prints a stack trace and exits.
+// - parent process goes away or tells us to shut down
+//   A "graceful shutdown" signal is sent (TERM, HUP, etc).
+//   The installed signal handler calls requestShutdown() which sets a flag.
+//   The Transport IO is interrupted, and Transport::loop() checks the flag and
+//   returns an error, etc.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_SHUTDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_SHUTDOWN_H
+
+#include <cerrno>
+#include <chrono>
+
+namespace clang {
+namespace clangd {
+
+/// Causes this process to crash if still running after Timeout.
+void abortAfterTimeout(std::chrono::seconds Timeout);
+
+/// Sets a flag to indicate that clangd was sent a shutdown signal, and the
+/// transport loop should exit at the next opportunity.
+/// If shutdown was already requested, aborts the process.
+/// This function is threadsafe and signal-safe.
+void requestShutdown();
+/// Checks whether requestShutdown() was called.
+/// This function is threadsafe and signal-safe.
+bool shutdownRequested();
+
+/// Retry an operation if it gets interrupted by a signal.
+/// This is like llvm::sys::RetryAfterSignal, except that if shutdown was
+/// requested (which interrupts IO), we'll fail rather than retry.
+template <typename Fun, typename Ret = decltype(std::declval<Fun>()())>
+Ret retryAfterSignalUnlessShutdown(
+    const typename std::enable_if<true, Ret>::type &Fail, // Suppress deduction.
+    const Fun &F) {
+  Ret Res;
+  do {
+    if (shutdownRequested())
+      return Fail;
+    errno = 0;
+    Res = F();
+  } while (Res == Fail && errno == EINTR);
+  return Res;
+}
+
+} // namespace clangd
+} // namespace clang
+
+#endif
diff --git a/clang-tools-extra/clangd/SourceCode.h b/clang-tools-extra/clangd/SourceCode.h
index 3b8aacef9bf17..f75be998dc2d4 100644
--- a/clang-tools-extra/clangd/SourceCode.h
+++ b/clang-tools-extra/clangd/SourceCode.h
@@ -223,6 +223,9 @@ struct Edit {
   /// Checks whether the Replacements are applicable to given Code.
   bool canApplyTo(llvm::StringRef Code) const;
 };
+/// A mapping from absolute file path (the one used for accessing the
+/// underlying VFS) to edits.
+using FileEdits = llvm::StringMap<Edit>;
 
 /// Formats the edits and code around it according to Style. Changes
 /// Replacements to formatted ones if succeeds.
diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp
index 6436e7a50c615..884c82d5b1909 100644
--- a/clang-tools-extra/clangd/TUScheduler.cpp
+++ b/clang-tools-extra/clangd/TUScheduler.cpp
@@ -407,8 +407,12 @@ void ASTWorker::update(ParseInputs Inputs, WantDiagnostics WantDiags) {
         llvm::join(Inputs.CompileCommand.CommandLine, " "));
     // Rebuild the preamble and the AST.
     StoreDiags CompilerInvocationDiagConsumer;
-    std::unique_ptr<CompilerInvocation> Invocation =
-        buildCompilerInvocation(Inputs, CompilerInvocationDiagConsumer);
+    std::vector<std::string> CC1Args;
+    std::unique_ptr<CompilerInvocation> Invocation = buildCompilerInvocation(
+        Inputs, CompilerInvocationDiagConsumer, &CC1Args);
+    // Log cc1 args even (especially!) if creating invocation failed.
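+    // (Illustrative: even when Invocation is null, the verbose log may still
+    // show a line like "Driver produced command: cc1 -fsyntax-only ...",
+    // which is exactly the failure case where it is most useful.)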
+ if (!CC1Args.empty()) + vlog("Driver produced command: cc1 {0}", llvm::join(CC1Args, " ")); std::vector CompilerInvocationDiags = CompilerInvocationDiagConsumer.take(); if (!Invocation) { @@ -916,6 +920,13 @@ llvm::StringRef TUScheduler::getContents(PathRef File) const { return It->second->Contents; } +llvm::StringMap TUScheduler::getAllFileContents() const { + llvm::StringMap Results; + for (auto &It : Files) + Results.try_emplace(It.getKey(), It.getValue()->Contents); + return Results; +} + void TUScheduler::run(llvm::StringRef Name, llvm::unique_function Action) { if (!PreambleTasks) diff --git a/clang-tools-extra/clangd/TUScheduler.h b/clang-tools-extra/clangd/TUScheduler.h index ff2d4d485047f..de3b895499831 100644 --- a/clang-tools-extra/clangd/TUScheduler.h +++ b/clang-tools-extra/clangd/TUScheduler.h @@ -180,6 +180,9 @@ class TUScheduler { /// The returned StringRef may be invalidated by any write to TUScheduler. llvm::StringRef getContents(PathRef File) const; + /// Returns a snapshot of all file buffer contents, per last update(). + llvm::StringMap getAllFileContents() const; + /// Schedule an async task with no dependencies. void run(llvm::StringRef Name, llvm::unique_function Action); diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index de10e3c48e202..8bcc268d1b187 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -191,9 +191,8 @@ std::vector locateSymbolAt(ParsedAST &AST, Position Pos, // Macros are simple: there's no declaration/definition distinction. // As a consequence, there's no need to look them up in the index either. - SourceLocation MaybeMacroLocation = - SM.getMacroArgExpandedLocation(getBeginningOfIdentifier( - Pos, AST.getSourceManager(), AST.getASTContext().getLangOpts())); + SourceLocation MaybeMacroLocation = SM.getMacroArgExpandedLocation( + getBeginningOfIdentifier(Pos, AST.getSourceManager(), AST.getLangOpts())); std::vector Result; if (auto M = locateMacroAt(MaybeMacroLocation, AST.getPreprocessor())) { if (auto Loc = makeLocation(AST.getASTContext(), @@ -366,7 +365,7 @@ std::vector findDocumentHighlights(ParsedAST &AST, auto References = findRefs( getDeclAtPosition(AST, SM.getMacroArgExpandedLocation(getBeginningOfIdentifier( - Pos, SM, AST.getASTContext().getLangOpts())), + Pos, SM, AST.getLangOpts())), Relations), AST); @@ -374,9 +373,8 @@ std::vector findDocumentHighlights(ParsedAST &AST, // different kinds, deduplicate them. std::vector Result; for (const auto &Ref : References) { - if (auto Range = - getTokenRange(AST.getASTContext().getSourceManager(), - AST.getASTContext().getLangOpts(), Ref.Loc)) { + if (auto Range = getTokenRange(AST.getASTContext().getSourceManager(), + AST.getLangOpts(), Ref.Loc)) { DocumentHighlight DH; DH.range = *Range; if (Ref.Role & index::SymbolRoleSet(index::SymbolRole::Write)) @@ -404,7 +402,7 @@ ReferencesResult findReferences(ParsedAST &AST, Position Pos, uint32_t Limit, return Results; } auto Loc = SM.getMacroArgExpandedLocation( - getBeginningOfIdentifier(Pos, SM, AST.getASTContext().getLangOpts())); + getBeginningOfIdentifier(Pos, SM, AST.getLangOpts())); // TODO: should we handle macros, too? // We also show references to the targets of using-decls, so we include // DeclRelation::Underlying. 
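The using-declaration case is easiest to see with a concrete snippet. The following is purely illustrative user code (not part of this patch), showing why the lookup includes `DeclRelation::Underlying`:

```cpp
namespace detail { void foo(); } // the underlying declaration
using detail::foo;               // the using-declaration re-exports it

void caller() {
  foo(); // find-references here should also report detail::foo, which is
         // what including DeclRelation::Underlying in the lookup achieves.
}
```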
@@ -424,8 +422,7 @@ ReferencesResult findReferences(ParsedAST &AST, Position Pos, uint32_t Limit, }), MainFileRefs.end()); for (const auto &Ref : MainFileRefs) { - if (auto Range = - getTokenRange(SM, AST.getASTContext().getLangOpts(), Ref.Loc)) { + if (auto Range = getTokenRange(SM, AST.getLangOpts(), Ref.Loc)) { Location Result; Result.range = *Range; Result.uri = URIForFile::canonicalize(*MainFilePath, *MainFilePath); @@ -470,7 +467,7 @@ ReferencesResult findReferences(ParsedAST &AST, Position Pos, uint32_t Limit, std::vector getSymbolInfo(ParsedAST &AST, Position Pos) { const SourceManager &SM = AST.getSourceManager(); auto Loc = SM.getMacroArgExpandedLocation( - getBeginningOfIdentifier(Pos, SM, AST.getASTContext().getLangOpts())); + getBeginningOfIdentifier(Pos, SM, AST.getLangOpts())); std::vector Results; @@ -646,7 +643,7 @@ static void fillSuperTypes(const CXXRecordDecl &CXXRD, ASTContext &ASTCtx, const CXXRecordDecl *findRecordTypeAt(ParsedAST &AST, Position Pos) { const SourceManager &SM = AST.getSourceManager(); SourceLocation SourceLocationBeg = SM.getMacroArgExpandedLocation( - getBeginningOfIdentifier(Pos, SM, AST.getASTContext().getLangOpts())); + getBeginningOfIdentifier(Pos, SM, AST.getLangOpts())); DeclRelationSet Relations = DeclRelation::TemplatePattern | DeclRelation::Underlying; auto Decls = getDeclAtPosition(AST, SourceLocationBeg, Relations); diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/package.json b/clang-tools-extra/clangd/clients/clangd-vscode/package.json index 05aafeb5f850c..8abf7e743e6f2 100644 --- a/clang-tools-extra/clangd/clients/clangd-vscode/package.json +++ b/clang-tools-extra/clangd/clients/clangd-vscode/package.json @@ -23,6 +23,7 @@ "activationEvents": [ "onLanguage:c", "onLanguage:cpp", + "onLanguage:cuda", "onLanguage:objective-c", "onLanguage:objective-cpp", "onCommand:clangd-vscode.activate" @@ -64,6 +65,13 @@ "**/MSVC/*/include/**" ], "firstLine": "^/[/*].*-\\*-\\s*C\\+\\+\\s*-\\*-.*" + }, + { + "id": "cuda", + "extensions": [ + ".cu", + ".cuh" + ] } ], "configuration": { diff --git a/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts b/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts index 330cf7ac262eb..1f96cffef2559 100644 --- a/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts +++ b/clang-tools-extra/clangd/clients/clangd-vscode/src/extension.ts @@ -83,21 +83,15 @@ export function activate(context: vscode.ExtensionContext) { } const serverOptions: vscodelc.ServerOptions = clangd; - // Note that CUDA ('.cu') files are special. When opening files of all other - // extensions, VSCode would load clangd automatically. This is achieved by - // having a corresponding 'onLanguage:...' activation event in package.json. - // However, VSCode does not have CUDA as a supported language yet, so we - // cannot add a corresponding activationEvent for CUDA files and clangd will - // *not* load itself automatically on '.cu' files. - const cudaFilePattern: string = '**/*.{' + [ 'cu' ].join() + '}'; const clientOptions: vscodelc.LanguageClientOptions = { // Register the server for c-family and cuda files. documentSelector: [ { scheme: 'file', language: 'c' }, { scheme: 'file', language: 'cpp' }, + // cuda is not supported by vscode, but our extension does. 
+            { scheme: 'file', language: 'cuda' },
             { scheme: 'file', language: 'objective-c'},
-            { scheme: 'file', language: 'objective-cpp'},
-            { scheme: 'file', pattern: cudaFilePattern },
+            { scheme: 'file', language: 'objective-cpp'}
         ],
         synchronize: !syncFileEvents ? undefined : {
             // FIXME: send sync file events when clangd provides implementations.
@@ -111,10 +105,10 @@ export function activate(context: vscode.ExtensionContext) {
         serverOptions, clientOptions);
     if (getConfig('semanticHighlighting')) {
       const semanticHighlightingFeature =
-          new semanticHighlighting.SemanticHighlightingFeature(clangdClient,
-                                                               context);
+        new semanticHighlighting.SemanticHighlightingFeature(clangdClient,
+                                                             context);
       context.subscriptions.push(
-          vscode.Disposable.from(semanticHighlightingFeature));
+        vscode.Disposable.from(semanticHighlightingFeature));
       clangdClient.registerFeature(semanticHighlightingFeature);
     }
     console.log('Clang Language Server is now active!');
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp
index 00adbd84fd62f..191cd68ccb29e 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.cpp
+++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp
@@ -16,6 +16,7 @@
 #include "SourceCode.h"
 #include "SymbolLocation.h"
 #include "URI.h"
+#include "index/SymbolID.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/DeclBase.h"
 #include "clang/AST/DeclCXX.h"
@@ -345,43 +346,52 @@ bool SymbolCollector::handleMacroOccurence(const IdentifierInfo *Name,
                                            const MacroInfo *MI,
                                            index::SymbolRoleSet Roles,
                                            SourceLocation Loc) {
-  if (!Opts.CollectMacro)
-    return true;
   assert(PP.get());
 
   const auto &SM = PP->getSourceManager();
   auto DefLoc = MI->getDefinitionLoc();
+  auto SpellingLoc = SM.getSpellingLoc(Loc);
+  bool IsMainFileSymbol = SM.isInMainFile(SM.getExpansionLoc(DefLoc));
 
   // Builtin macros don't have useful locations and aren't needed in completion.
   if (MI->isBuiltinMacro())
     return true;
 
-  // Skip main-file symbols if we are not collecting them.
-  bool IsMainFileSymbol = SM.isInMainFile(SM.getExpansionLoc(DefLoc));
-  if (IsMainFileSymbol && !Opts.CollectMainFileSymbols)
-    return false;
-
   // Also avoid storing predefined macros like __DBL_MIN__.
   if (SM.isWrittenInBuiltinFile(DefLoc))
     return true;
 
+  auto ID = getSymbolID(Name->getName(), MI, SM);
+  if (!ID)
+    return true;
+
+  // Do not store references to main-file macros.
+  if ((static_cast<unsigned>(Opts.RefFilter) & Roles) && !IsMainFileSymbol &&
+      (Opts.RefsInHeaders || SM.getFileID(SpellingLoc) == SM.getMainFileID()))
+    MacroRefs[*ID].push_back({Loc, Roles});
+
+  // Collect symbols.
+  if (!Opts.CollectMacro)
+    return true;
+
+  // Skip main-file macros if we are not collecting them.
+  if (IsMainFileSymbol && !Opts.CollectMainFileSymbols)
+    return false;
+
   // Mark the macro as referenced if this is a reference coming from the main
   // file. The macro may not be an interesting symbol, but it's cheaper to check
   // at the end.
   if (Opts.CountReferences &&
       (Roles & static_cast<unsigned>(index::SymbolRole::Reference)) &&
-      SM.getFileID(SM.getSpellingLoc(Loc)) == SM.getMainFileID())
+      SM.getFileID(SpellingLoc) == SM.getMainFileID())
     ReferencedMacros.insert(Name);
+
+  // Don't continue indexing if this is a mere reference.
   // FIXME: remove macro with ID if it is undefined.
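+  // (e.g. a plain `#ifdef FOO` mentions FOO with only a Reference role; the
+  //  occurrence was recorded above, but it neither declares nor defines the
+  //  macro, so there is no symbol to collect. Illustrative example.)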
   if (!(Roles & static_cast<unsigned>(index::SymbolRole::Declaration) ||
         Roles & static_cast<unsigned>(index::SymbolRole::Definition)))
     return true;
 
-  auto ID = getSymbolID(Name->getName(), MI, SM);
-  if (!ID)
-    return true;
-
   // Only collect one instance in case there are multiple.
   if (Symbols.find(*ID) != nullptr)
     return true;
@@ -485,10 +495,10 @@ void SymbolCollector::finish() {
         IncRef(*ID);
       }
   }
-
   // Fill in IncludeHeaders.
   // We delay this until end of TU so header guards are all resolved.
-  // Symbols in slabs aren't mutable, so insert() has to walk all the strings :-(
+  // Symbols in slabs aren't mutable, so insert() has to walk all the strings
+  // :-(
   llvm::SmallString<256> QName;
   for (const auto &Entry : IncludeFiles)
     if (const Symbol *S = Symbols.find(Entry.first)) {
@@ -518,25 +528,34 @@ void SymbolCollector::finish() {
       }
       return Found->second;
     };
+    auto CollectRef =
+        [&](SymbolID ID,
+            const std::pair<SourceLocation, index::SymbolRoleSet> &LocAndRole) {
+          auto FileID = SM.getFileID(LocAndRole.first);
+          // FIXME: use the result to filter out references.
+          shouldIndexFile(FileID);
+          if (auto FileURI = GetURI(FileID)) {
+            auto Range =
+                getTokenRange(LocAndRole.first, SM, ASTCtx->getLangOpts());
+            Ref R;
+            R.Location.Start = Range.first;
+            R.Location.End = Range.second;
+            R.Location.FileURI = FileURI->c_str();
+            R.Kind = toRefKind(LocAndRole.second);
+            Refs.insert(ID, R);
+          }
+        };
+    // Populate Refs slab from MacroRefs.
+    for (const auto &IDAndRefs : MacroRefs) {
+      for (const auto &LocAndRole : IDAndRefs.second)
+        CollectRef(IDAndRefs.first, LocAndRole);
+    }
     // Populate Refs slab from DeclRefs.
     if (auto MainFileURI = GetURI(SM.getMainFileID())) {
       for (const auto &It : DeclRefs) {
         if (auto ID = getSymbolID(It.first)) {
-          for (const auto &LocAndRole : It.second) {
-            auto FileID = SM.getFileID(LocAndRole.first);
-            // FIXME: use the result to filter out references.
-            shouldIndexFile(FileID);
-            if (auto FileURI = GetURI(FileID)) {
-              auto Range =
-                  getTokenRange(LocAndRole.first, SM, ASTCtx->getLangOpts());
-              Ref R;
-              R.Location.Start = Range.first;
-              R.Location.End = Range.second;
-              R.Location.FileURI = FileURI->c_str();
-              R.Kind = toRefKind(LocAndRole.second);
-              Refs.insert(*ID, R);
-            }
-          }
+          for (const auto &LocAndRole : It.second)
+            CollectRef(*ID, LocAndRole);
         }
       }
     }
diff --git a/clang-tools-extra/clangd/index/SymbolCollector.h b/clang-tools-extra/clangd/index/SymbolCollector.h
index 5ad44150b4d56..bc5095d516db8 100644
--- a/clang-tools-extra/clangd/index/SymbolCollector.h
+++ b/clang-tools-extra/clangd/index/SymbolCollector.h
@@ -151,11 +151,12 @@ class SymbolCollector : public index::IndexDataConsumer {
   std::shared_ptr<GlobalCodeCompletionAllocator> CompletionAllocator;
   std::unique_ptr<CodeCompletionTUInfo> CompletionTUInfo;
   Options Opts;
-  using DeclRef = std::pair<SourceLocation, index::SymbolRoleSet>;
+  using SymbolRef = std::pair<SourceLocation, index::SymbolRoleSet>;
   // Symbols referenced from the current TU, flushed on finish().
   llvm::DenseSet<const NamedDecl *> ReferencedDecls;
   llvm::DenseSet<const IdentifierInfo *> ReferencedMacros;
-  llvm::DenseMap<const NamedDecl *, std::vector<DeclRef>> DeclRefs;
+  llvm::DenseMap<const NamedDecl *, std::vector<SymbolRef>> DeclRefs;
+  llvm::DenseMap<SymbolID, std::vector<SymbolRef>> MacroRefs;
   // Maps canonical declaration provided by clang to canonical declaration for
   // an index symbol, if clangd prefers a different declaration than that
   // provided by clang. For example, friend declaration might be considered
diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp
index fb83083384f95..3f3c216c5909c 100644
--- a/clang-tools-extra/clangd/refactor/Rename.cpp
+++ b/clang-tools-extra/clangd/refactor/Rename.cpp
@@ -18,6 +18,9 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/Basic/SourceLocation.h"
 #include "clang/Tooling/Refactoring/Rename/USRFindingAction.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
 
 namespace clang {
 namespace clangd {
@@ -55,8 +58,7 @@ llvm::Optional<std::string> getOtherRefFile(const Decl &D, StringRef MainFile,
   // tradeoff. We expect the number of symbol references in the current file
   // is smaller than the limit.
   Req.Limit = 100;
-  if (auto ID = getSymbolID(&D))
-    Req.IDs.insert(*ID);
+  Req.IDs.insert(*getSymbolID(&D));
   llvm::Optional<std::string> OtherFile;
   Index.refs(Req, [&](const Ref &R) {
     if (OtherFile)
@@ -83,7 +85,7 @@ llvm::DenseSet<const Decl *> locateDeclAt(ParsedAST &AST,
   // range of the Decl. This would avoid allowing rename on unrelated tokens.
   //   ^class Foo {} // SelectionTree returns CXXRecordDecl,
   //                 // we don't attempt to trigger rename on this position.
-  // FIXME: make this work on destructors, e.g. "~F^oo()".
+  // FIXME: Make this work on destructors, e.g. "~F^oo()".
   if (const auto *D = SelectedNode->ASTNode.get<Decl>()) {
     if (D->getLocation() != TokenStartLoc)
       return {};
@@ -101,71 +103,101 @@ enum ReasonToReject {
   NoSymbolFound,
   NoIndexProvided,
   NonIndexable,
-  UsedOutsideFile,
+  UsedOutsideFile, // for within-file rename only.
   UnsupportedSymbol,
   AmbiguousSymbol,
 };
 
-// Check the symbol Decl is renameable (per the index) within the file.
-llvm::Optional<ReasonToReject> renamableWithinFile(const Decl &RenameDecl,
-                                                   StringRef MainFile,
-                                                   const SymbolIndex *Index) {
+llvm::Optional<ReasonToReject> renameable(const Decl &RenameDecl,
+                                          StringRef MainFilePath,
+                                          const SymbolIndex *Index,
+                                          bool CrossFile) {
+  // Filter out symbols that are unsupported in both rename modes.
   if (llvm::isa<NamespaceDecl>(&RenameDecl))
     return ReasonToReject::UnsupportedSymbol;
   if (const auto *FD = llvm::dyn_cast<FunctionDecl>(&RenameDecl)) {
     if (FD->isOverloadedOperator())
       return ReasonToReject::UnsupportedSymbol;
   }
+  // Function-local symbols are safe to rename.
+  if (RenameDecl.getParentFunctionOrMethod())
+    return None;
+
+  // Check whether the symbol being renamed is indexable.
   auto &ASTCtx = RenameDecl.getASTContext();
-  const auto &SM = ASTCtx.getSourceManager();
-  bool MainFileIsHeader = isHeaderFile(MainFile, ASTCtx.getLangOpts());
-  bool DeclaredInMainFile = isInsideMainFile(RenameDecl.getBeginLoc(), SM);
+  bool MainFileIsHeader = isHeaderFile(MainFilePath, ASTCtx.getLangOpts());
+  bool DeclaredInMainFile =
+      isInsideMainFile(RenameDecl.getBeginLoc(), ASTCtx.getSourceManager());
+  bool IsMainFileOnly = true;
+  if (MainFileIsHeader)
+    // The main file is a header, so the symbol can't be main-file-only.
+    IsMainFileOnly = false;
+  else if (!DeclaredInMainFile)
+    IsMainFileOnly = false;
+  bool IsIndexable =
+      isa<NamedDecl>(RenameDecl) &&
+      SymbolCollector::shouldCollectSymbol(
+          cast<NamedDecl>(RenameDecl), RenameDecl.getASTContext(),
+          SymbolCollector::Options(), IsMainFileOnly);
+  if (!IsIndexable) // If the symbol is not indexable, we disallow rename.
+    return ReasonToReject::NonIndexable;
 
-  if (!DeclaredInMainFile)
-    // We are sure the symbol is used externally, bail out early.
-    return UsedOutsideFile;
+  if (!CrossFile) {
+    if (!DeclaredInMainFile)
+      // We are sure the symbol is used externally, bail out early.
+      return ReasonToReject::UsedOutsideFile;
 
-  // If the symbol is declared in the main file (which is not a header), we
-  // rename it.
-  if (!MainFileIsHeader)
-    return None;
+    // If the symbol is declared in the main file (which is not a header), we
+    // rename it.
+    if (!MainFileIsHeader)
+      return None;
 
-  // Below are cases where the symbol is declared in the header.
-  // If the symbol is function-local, we rename it.
-  if (RenameDecl.getParentFunctionOrMethod())
-    return None;
+    if (!Index)
+      return ReasonToReject::NoIndexProvided;
+
+    auto OtherFile = getOtherRefFile(RenameDecl, MainFilePath, *Index);
+    // If the symbol is indexable and has no refs from other files in the
+    // index, we rename it.
+    if (!OtherFile)
+      return None;
+    // If the symbol is indexable and has refs from other files in the index,
+    // we disallow rename.
+    return ReasonToReject::UsedOutsideFile;
+  }
 
+  assert(CrossFile);
   if (!Index)
     return ReasonToReject::NoIndexProvided;
 
-  bool IsIndexable = isa<NamedDecl>(RenameDecl) &&
-                     SymbolCollector::shouldCollectSymbol(
-                         cast<NamedDecl>(RenameDecl), ASTCtx, {}, false);
-  // If the symbol is not indexable, we disallow rename.
-  if (!IsIndexable)
-    return ReasonToReject::NonIndexable;
-  auto OtherFile = getOtherRefFile(RenameDecl, MainFile, *Index);
-  // If the symbol is indexable and has no refs from other files in the index,
-  // we rename it.
-  if (!OtherFile)
-    return None;
-  // If the symbol is indexable and has refs from other files in the index,
-  // we disallow rename.
-  return ReasonToReject::UsedOutsideFile;
+  // Blacklist symbols that are not supported yet in cross-file mode due to the
+  // limitations of our index.
+  // FIXME: Renaming templates requires renaming all related specializations;
+  // our index doesn't have this information.
+  if (RenameDecl.getDescribedTemplate())
+    return ReasonToReject::UnsupportedSymbol;
+
+  // FIXME: Renaming virtual methods requires renaming all overrides in
+  // subclasses; our index doesn't have this information.
+  // Note: Within-file rename does support this through the AST.
+  if (const auto *S = llvm::dyn_cast<CXXMethodDecl>(&RenameDecl)) {
+    if (S->isVirtual())
+      return ReasonToReject::UnsupportedSymbol;
+  }
+  return None;
 }
 
 llvm::Error makeError(ReasonToReject Reason) {
   auto Message = [](ReasonToReject Reason) {
     switch (Reason) {
-    case NoSymbolFound:
+    case ReasonToReject::NoSymbolFound:
       return "there is no symbol at the given location";
-    case NoIndexProvided:
-      return "symbol may be used in other files (no index available)";
-    case UsedOutsideFile:
+    case ReasonToReject::NoIndexProvided:
+      return "no index provided";
+    case ReasonToReject::UsedOutsideFile:
       return "the symbol is used outside main file";
-    case NonIndexable:
+    case ReasonToReject::NonIndexable:
       return "symbol may be used in other files (not eligible for indexing)";
-    case UnsupportedSymbol:
+    case ReasonToReject::UnsupportedSymbol:
       return "symbol is not a supported kind (e.g. namespace, macro)";
     case AmbiguousSymbol:
       return "there are multiple symbols at the given location";
@@ -188,7 +220,7 @@ std::vector<SourceLocation> findOccurrencesWithinFile(ParsedAST &AST,
       ND.getDescribedTemplate() ? *ND.getDescribedTemplate() : ND;
   // getUSRsForDeclaration will find other related symbols, e.g. virtual and its
   // overrides, primary template and all explicit specializations.
-  // FIXME: get rid of the remaining tooling APIs.
+  // FIXME: Get rid of the remaining tooling APIs.
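+  // (Illustrative: for `template <class T> class Foo` with an explicit
+  //  specialization `template <> class Foo<int>`, both declarations end up in
+  //  RenameUSRs, so occurrences spelled via the specialization are renamed
+  //  as well.)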
std::vector RenameUSRs = tooling::getUSRsForDeclaration( tooling::getCanonicalSymbolDeclaration(&RenameDecl), AST.getASTContext()); llvm::DenseSet TargetIDs; @@ -212,35 +244,14 @@ std::vector findOccurrencesWithinFile(ParsedAST &AST, return Results; } -} // namespace - +// AST-based rename, it renames all occurrences in the main file. llvm::Expected -renameWithinFile(ParsedAST &AST, llvm::StringRef File, Position Pos, - llvm::StringRef NewName, const SymbolIndex *Index) { +renameWithinFile(ParsedAST &AST, const NamedDecl &RenameDecl, + llvm::StringRef NewName) { const SourceManager &SM = AST.getSourceManager(); - SourceLocation SourceLocationBeg = SM.getMacroArgExpandedLocation( - getBeginningOfIdentifier(Pos, SM, AST.getASTContext().getLangOpts())); - // FIXME: renaming macros is not supported yet, the macro-handling code should - // be moved to rename tooling library. - if (locateMacroAt(SourceLocationBeg, AST.getPreprocessor())) - return makeError(UnsupportedSymbol); - - auto DeclsUnderCursor = locateDeclAt(AST, SourceLocationBeg); - if (DeclsUnderCursor.empty()) - return makeError(NoSymbolFound); - if (DeclsUnderCursor.size() > 1) - return makeError(AmbiguousSymbol); - - const auto *RenameDecl = llvm::dyn_cast(*DeclsUnderCursor.begin()); - if (!RenameDecl) - return makeError(UnsupportedSymbol); - - if (auto Reject = - renamableWithinFile(*RenameDecl->getCanonicalDecl(), File, Index)) - return makeError(*Reject); tooling::Replacements FilteredChanges; - for (SourceLocation Loc : findOccurrencesWithinFile(AST, *RenameDecl)) { + for (SourceLocation Loc : findOccurrencesWithinFile(AST, RenameDecl)) { SourceLocation RenameLoc = Loc; // We don't rename in any macro bodies, but we allow rename the symbol // spelled in a top-level macro argument in the main file. @@ -265,5 +276,233 @@ renameWithinFile(ParsedAST &AST, llvm::StringRef File, Position Pos, return FilteredChanges; } +Range toRange(const SymbolLocation &L) { + Range R; + R.start.line = L.Start.line(); + R.start.character = L.Start.column(); + R.end.line = L.End.line(); + R.end.character = L.End.column(); + return R; +} + +// Return all rename occurrences (using the index) outside of the main file, +// grouped by the absolute file path. +llvm::Expected>> +findOccurrencesOutsideFile(const NamedDecl &RenameDecl, + llvm::StringRef MainFile, const SymbolIndex &Index) { + RefsRequest RQuest; + RQuest.IDs.insert(*getSymbolID(&RenameDecl)); + + // Absolute file path => rename occurrences in that file. + llvm::StringMap> AffectedFiles; + // FIXME: Make the limit customizable. + static constexpr size_t MaxLimitFiles = 50; + bool HasMore = Index.refs(RQuest, [&](const Ref &R) { + if (AffectedFiles.size() > MaxLimitFiles) + return; + if (auto RefFilePath = filePath(R.Location, /*HintFilePath=*/MainFile)) { + if (*RefFilePath != MainFile) + AffectedFiles[*RefFilePath].push_back(toRange(R.Location)); + } + }); + + if (AffectedFiles.size() > MaxLimitFiles) + return llvm::make_error( + llvm::formatv("The number of affected files exceeds the max limit {0}", + MaxLimitFiles), + llvm::inconvertibleErrorCode()); + if (HasMore) { + return llvm::make_error( + llvm::formatv("The symbol {0} has too many occurrences", + RenameDecl.getQualifiedNameAsString()), + llvm::inconvertibleErrorCode()); + } + + return AffectedFiles; +} + +// Index-based rename, it renames all occurrences outside of the main file. +// +// The cross-file rename is purely based on the index, as we don't want to +// build all ASTs for affected files, which may cause a performance hit. 
+// We choose to trade off some correctness for performance and scalability.
+//
+// Clangd builds a dynamic index for all opened files on top of the static
+// index of the whole codebase. The dynamic index is up-to-date (respects
+// dirty buffers) once clangd has finished processing opened files, while the
+// static (background) index is relatively stale. We choose the dirty buffers
+// as the file content we rename on, and fall back to the file content on disk
+// if there is no dirty buffer.
+//
+// FIXME: Add range patching heuristics to detect staleness of the index, and
+// report to users.
+// FIXME: Our index may return implicit references, which are not eligible for
+// rename; we should filter out these references.
+llvm::Expected<FileEdits> renameOutsideFile(
+    const NamedDecl &RenameDecl, llvm::StringRef MainFilePath,
+    llvm::StringRef NewName, const SymbolIndex &Index,
+    llvm::function_ref<llvm::Expected<std::string>(PathRef)> GetFileContent) {
+  auto AffectedFiles =
+      findOccurrencesOutsideFile(RenameDecl, MainFilePath, Index);
+  if (!AffectedFiles)
+    return AffectedFiles.takeError();
+  FileEdits Results;
+  for (auto &FileAndOccurrences : *AffectedFiles) {
+    llvm::StringRef FilePath = FileAndOccurrences.first();
+
+    auto AffectedFileCode = GetFileContent(FilePath);
+    if (!AffectedFileCode) {
+      elog("Failed to read file content: {0}", AffectedFileCode.takeError());
+      continue;
+    }
+    auto RenameEdit =
+        buildRenameEdit(FilePath, *AffectedFileCode,
+                        std::move(FileAndOccurrences.second), NewName);
+    if (!RenameEdit) {
+      return llvm::make_error<llvm::StringError>(
+          llvm::formatv("failed to build rename edit for file {0}: {1}",
+                        FilePath, llvm::toString(RenameEdit.takeError())),
+          llvm::inconvertibleErrorCode());
+    }
+    if (!RenameEdit->Replacements.empty())
+      Results.insert({FilePath, std::move(*RenameEdit)});
+  }
+  return Results;
+}
+
 } // namespace
 
+llvm::Expected<FileEdits> rename(const RenameInputs &RInputs) {
+  ParsedAST &AST = RInputs.AST;
+  const SourceManager &SM = AST.getSourceManager();
+  llvm::StringRef MainFileCode = SM.getBufferData(SM.getMainFileID());
+  auto GetFileContent = [&RInputs,
+                         &SM](PathRef AbsPath) -> llvm::Expected<std::string> {
+    llvm::Optional<std::string> DirtyBuffer;
+    if (RInputs.GetDirtyBuffer &&
+        (DirtyBuffer = RInputs.GetDirtyBuffer(AbsPath)))
+      return std::move(*DirtyBuffer);
+
+    auto Content =
+        SM.getFileManager().getVirtualFileSystem().getBufferForFile(AbsPath);
+    if (!Content)
+      return llvm::createStringError(
+          llvm::inconvertibleErrorCode(),
+          llvm::formatv("Failed to open file {0}: {1}", AbsPath,
+                        Content.getError().message()));
+    if (!*Content)
+      return llvm::createStringError(
+          llvm::inconvertibleErrorCode(),
+          llvm::formatv("Got no buffer for file {0}", AbsPath));
+
+    return (*Content)->getBuffer().str();
+  };
+  SourceLocation SourceLocationBeg = SM.getMacroArgExpandedLocation(
+      getBeginningOfIdentifier(RInputs.Pos, SM, AST.getLangOpts()));
+  // FIXME: Renaming macros is not supported yet; the macro-handling code
+  // should be moved to the rename tooling library.
+  if (locateMacroAt(SourceLocationBeg, AST.getPreprocessor()))
+    return makeError(ReasonToReject::UnsupportedSymbol);
+
+  auto DeclsUnderCursor = locateDeclAt(AST, SourceLocationBeg);
+  if (DeclsUnderCursor.empty())
+    return makeError(ReasonToReject::NoSymbolFound);
+  if (DeclsUnderCursor.size() > 1)
+    return makeError(ReasonToReject::AmbiguousSymbol);
+
+  const auto *RenameDecl = llvm::dyn_cast<NamedDecl>(*DeclsUnderCursor.begin());
+  if (!RenameDecl)
+    return makeError(ReasonToReject::UnsupportedSymbol);
+
+  auto Reject =
+      renameable(*RenameDecl->getCanonicalDecl(), RInputs.MainFilePath,
+                 RInputs.Index, RInputs.AllowCrossFile);
+  if (Reject)
+    return makeError(*Reject);
+
+  // We have two implementations of the rename:
+  //   - AST-based rename: used for renaming local symbols, e.g. variables
+  //     defined in a function body;
+  //   - index-based rename: used for renaming non-local symbols, and not
+  //     feasible for local symbols (by design, our index doesn't index these
+  //     symbols).
+  // To make cross-file rename work for local symbols, we use a hybrid
+  // solution:
+  //   - run AST-based rename on the main file;
+  //   - run index-based rename on other affected files;
+  auto MainFileRenameEdit = renameWithinFile(AST, *RenameDecl, RInputs.NewName);
+  if (!MainFileRenameEdit)
+    return MainFileRenameEdit.takeError();
+
+  if (!RInputs.AllowCrossFile) {
+    // Within-file rename: just return the main file results.
+    return FileEdits(
+        {std::make_pair(RInputs.MainFilePath,
+                        Edit{MainFileCode, std::move(*MainFileRenameEdit)})});
+  }
+
+  FileEdits Results;
+  // renameable() above guarantees that we are renaming a local symbol if we
+  // don't have an index at this point.
+  if (RInputs.Index) {
+    auto OtherFilesEdits =
+        renameOutsideFile(*RenameDecl, RInputs.MainFilePath, RInputs.NewName,
+                          *RInputs.Index, GetFileContent);
+    if (!OtherFilesEdits)
+      return OtherFilesEdits.takeError();
+    Results = std::move(*OtherFilesEdits);
+  }
+  // Attach the rename edits for the main file.
+  Results.try_emplace(RInputs.MainFilePath, MainFileCode,
+                      std::move(*MainFileRenameEdit));
+  return Results;
+}
+
+llvm::Expected<Edit> buildRenameEdit(llvm::StringRef AbsFilePath,
+                                     llvm::StringRef InitialCode,
+                                     std::vector<Range> Occurrences,
+                                     llvm::StringRef NewName) {
+  llvm::sort(Occurrences);
+  // These two always correspond to the same position.
+  Position LastPos{0, 0};
+  size_t LastOffset = 0;
+
+  auto Offset = [&](const Position &P) -> llvm::Expected<size_t> {
+    assert(LastPos <= P && "malformed input");
+    Position Shifted = {
+        P.line - LastPos.line,
+        P.line > LastPos.line ?
            P.character : P.character - LastPos.character};
+    auto ShiftedOffset =
+        positionToOffset(InitialCode.substr(LastOffset), Shifted);
+    if (!ShiftedOffset)
+      return llvm::make_error<llvm::StringError>(
+          llvm::formatv("failed to convert the position {0} to offset ({1})",
+                        P, llvm::toString(ShiftedOffset.takeError())),
+          llvm::inconvertibleErrorCode());
+    LastPos = P;
+    LastOffset += *ShiftedOffset;
+    return LastOffset;
+  };
+
+  std::vector<std::pair<size_t, size_t>> OccurrencesOffsets;
+  for (const auto &R : Occurrences) {
+    auto StartOffset = Offset(R.start);
+    if (!StartOffset)
+      return StartOffset.takeError();
+    auto EndOffset = Offset(R.end);
+    if (!EndOffset)
+      return EndOffset.takeError();
+    OccurrencesOffsets.push_back({*StartOffset, *EndOffset});
+  }
+
+  tooling::Replacements RenameEdit;
+  for (const auto &R : OccurrencesOffsets) {
+    auto ByteLength = R.second - R.first;
+    if (auto Err = RenameEdit.add(
+            tooling::Replacement(AbsFilePath, R.first, ByteLength, NewName)))
+      return std::move(Err);
+  }
+  return Edit(InitialCode, std::move(RenameEdit));
+}
+
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/refactor/Rename.h b/clang-tools-extra/clangd/refactor/Rename.h
index 63a1ffe321508..6f38c14a3e2a8 100644
--- a/clang-tools-extra/clangd/refactor/Rename.h
+++ b/clang-tools-extra/clangd/refactor/Rename.h
@@ -9,7 +9,9 @@
 #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_REFACTOR_RENAME_H
 #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_REFACTOR_RENAME_H
 
+#include "Path.h"
 #include "Protocol.h"
+#include "SourceCode.h"
 #include "clang/Tooling/Core/Replacement.h"
 #include "llvm/Support/Error.h"
 
@@ -18,13 +20,40 @@ namespace clangd {
 class ParsedAST;
 class SymbolIndex;
 
-/// Renames all occurrences of the symbol at \p Pos to \p NewName.
-/// Occurrences outside the current file are not modified.
-/// Returns an error if renaming a symbol that's used in another file (per the
-/// index).
-llvm::Expected<tooling::Replacements>
-renameWithinFile(ParsedAST &AST, llvm::StringRef File, Position Pos,
-                 llvm::StringRef NewName, const SymbolIndex *Index = nullptr);
+/// Gets the dirty buffer for a given file \p AbsPath.
+/// Returns None if there is no dirty buffer for the given file.
+using DirtyBufferGetter =
+    llvm::function_ref<llvm::Optional<std::string>(PathRef AbsPath)>;
+
+struct RenameInputs {
+  Position Pos; // the position triggering the rename
+  llvm::StringRef NewName;
+
+  ParsedAST &AST;
+  llvm::StringRef MainFilePath;
+
+  const SymbolIndex *Index = nullptr;
+
+  bool AllowCrossFile = false;
+  // When set, used by the rename to get file content for all rename-related
+  // files.
+  // If there is no corresponding dirty buffer, we will use the file content
+  // from disk.
+  DirtyBufferGetter GetDirtyBuffer = nullptr;
+};
+
+/// Renames all occurrences of the symbol.
+/// If AllowCrossFile is false, returns an error when renaming a symbol that's
+/// used in another file (per the index).
+llvm::Expected<FileEdits> rename(const RenameInputs &RInputs);
+
+/// Generates rename edits that replace all given occurrences with NewName.
+/// Exposed for testing only.
+llvm::Expected<Edit> buildRenameEdit(llvm::StringRef AbsFilePath,
+                                     llvm::StringRef InitialCode,
+                                     std::vector<Range> Occurrences,
+                                     llvm::StringRef NewName);
 
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/clangd/refactor/Tweak.h b/clang-tools-extra/clangd/refactor/Tweak.h
index de655abd98c7b..69ac4ad612e9d 100644
--- a/clang-tools-extra/clangd/refactor/Tweak.h
+++ b/clang-tools-extra/clangd/refactor/Tweak.h
@@ -77,9 +77,7 @@ class Tweak {
   struct Effect {
     /// A message to be displayed to the user.
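+    /// (A client would typically surface this via something like LSP's
+    /// window/showMessage; how it is displayed is up to the embedder.)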
    llvm::Optional<std::string> ShowMessage;
-    /// A mapping from file path (the one used for accessing the underlying
-    /// VFS) to edits.
-    llvm::StringMap<Edit> ApplyEdits;
+    FileEdits ApplyEdits;
 
     static Effect showMessage(StringRef S) {
       Effect E;
diff --git a/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt b/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt
index ddf10a2ca2bac..6f6ef4a2ace23 100644
--- a/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt
+++ b/clang-tools-extra/clangd/refactor/tweaks/CMakeLists.txt
@@ -15,10 +15,12 @@ add_clang_library(clangDaemonTweaks OBJECT
   AnnotateHighlightings.cpp
   DumpAST.cpp
   DefineInline.cpp
+  DefineOutline.cpp
   ExpandAutoType.cpp
   ExpandMacro.cpp
   ExtractFunction.cpp
   ExtractVariable.cpp
+  ObjCLocalizeStringLiteral.cpp
   RawStringLiteral.cpp
   RemoveUsingNamespace.cpp
   SwapIfBranches.cpp
diff --git a/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
new file mode 100644
index 0000000000000..f6bed9727cf10
--- /dev/null
+++ b/clang-tools-extra/clangd/refactor/tweaks/DefineOutline.cpp
@@ -0,0 +1,330 @@
+//===--- DefineOutline.cpp --------------------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AST.h"
+#include "FindTarget.h"
+#include "HeaderSourceSwitch.h"
+#include "Logger.h"
+#include "Path.h"
+#include "Selection.h"
+#include "SourceCode.h"
+#include "refactor/Tweak.h"
+#include "clang/AST/ASTTypeTraits.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclTemplate.h"
+#include "clang/AST/Stmt.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Driver/Types.h"
+#include "clang/Format/Format.h"
+#include "clang/Tooling/Core/Replacement.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+#include <cstddef>
+#include <string>
+
+namespace clang {
+namespace clangd {
+namespace {
+
+// Deduces the FunctionDecl from a selection. Requires either the function body
+// or the function decl to be selected. Returns null if none of the above
+// criteria is met.
+// FIXME: This is shared with define inline; move them to a common header once
+// we have a place for such.
+const FunctionDecl *getSelectedFunction(const SelectionTree::Node *SelNode) {
+  if (!SelNode)
+    return nullptr;
+  const ast_type_traits::DynTypedNode &AstNode = SelNode->ASTNode;
+  if (const FunctionDecl *FD = AstNode.get<FunctionDecl>())
+    return FD;
+  if (AstNode.get<CompoundStmt>() &&
+      SelNode->Selected == SelectionTree::Complete) {
+    if (const SelectionTree::Node *P = SelNode->Parent)
+      return P->ASTNode.get<FunctionDecl>();
+  }
+  return nullptr;
+}
+
+llvm::Optional<Path> getSourceFile(llvm::StringRef FileName,
+                                   const Tweak::Selection &Sel) {
+  if (auto Source = getCorrespondingHeaderOrSource(
+          FileName,
+          &Sel.AST.getSourceManager().getFileManager().getVirtualFileSystem()))
+    return *Source;
+  return getCorrespondingHeaderOrSource(FileName, Sel.AST, Sel.Index);
+}
+
+// Synthesize a DeclContext for TargetNS from CurContext. TargetNS must be
+// empty for the global namespace, and end with "::" otherwise.
+// Returns None if TargetNS is not a prefix of CurContext.
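+// For example (illustrative): with CurContext inside namespace a::b::c,
+// TargetNS == "a::b::" yields the DeclContext of a::b, while TargetNS ==
+// "x::" is not a prefix of "a::b::c::" and yields None.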
+llvm::Optional<const DeclContext *>
+findContextForNS(llvm::StringRef TargetNS, const DeclContext *CurContext) {
+  assert(TargetNS.empty() || TargetNS.endswith("::"));
+  // Skip any non-namespace contexts, e.g. TagDecls, functions/methods.
+  CurContext = CurContext->getEnclosingNamespaceContext();
+  // If TargetNS is empty, it means the global namespace, which is the
+  // translation unit.
+  if (TargetNS.empty()) {
+    while (!CurContext->isTranslationUnit())
+      CurContext = CurContext->getParent();
+    return CurContext;
+  }
+  // Otherwise we need to drop any trailing namespaces from CurContext until
+  // we reach TargetNS.
+  std::string TargetContextNS =
+      CurContext->isNamespace()
+          ? llvm::cast<NamespaceDecl>(CurContext)->getQualifiedNameAsString()
+          : "";
+  TargetContextNS.append("::");
+
+  llvm::StringRef CurrentContextNS(TargetContextNS);
+  // If TargetNS is not a prefix of CurrentContext, there's no way to reach
+  // it.
+  if (!CurrentContextNS.startswith(TargetNS))
+    return llvm::None;
+
+  while (CurrentContextNS != TargetNS) {
+    CurContext = CurContext->getParent();
+    // These colons always exist since TargetNS is a prefix of
+    // CurrentContextNS, it ends with "::" and they are not equal.
+    CurrentContextNS = CurrentContextNS.take_front(
+        CurrentContextNS.drop_back(2).rfind("::") + 2);
+  }
+  return CurContext;
+}
+
+// Returns source code for FD after applying Replacements.
+// FIXME: Make the function take a parameter to return only the function body,
+// afterwards it can be shared with define-inline code action.
+llvm::Expected<std::string>
+getFunctionSourceAfterReplacements(const FunctionDecl *FD,
+                                   const tooling::Replacements &Replacements) {
+  const auto &SM = FD->getASTContext().getSourceManager();
+  auto OrigFuncRange = toHalfOpenFileRange(
+      SM, FD->getASTContext().getLangOpts(), FD->getSourceRange());
+  if (!OrigFuncRange)
+    return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                   "Couldn't get range for function.");
+  // Include template parameter list.
+  if (auto *FTD = FD->getDescribedFunctionTemplate())
+    OrigFuncRange->setBegin(FTD->getBeginLoc());
+
+  // Get new begin and end positions for the qualified function definition.
+  unsigned FuncBegin = SM.getFileOffset(OrigFuncRange->getBegin());
+  unsigned FuncEnd = Replacements.getShiftedCodePosition(
+      SM.getFileOffset(OrigFuncRange->getEnd()));
+
+  // Trim the result to the function definition.
+  auto QualifiedFunc = tooling::applyAllReplacements(
+      SM.getBufferData(SM.getMainFileID()), Replacements);
+  if (!QualifiedFunc)
+    return QualifiedFunc.takeError();
+  return QualifiedFunc->substr(FuncBegin, FuncEnd - FuncBegin + 1);
+}
+
+// Creates a modified version of the function definition that can be inserted
+// at a different location; qualifies the return value and function name to
+// achieve that. Contains the function signature, body, and template
+// parameters if applicable. No need to qualify parameters, as they are looked
+// up in the context containing the function/method.
+llvm::Expected<std::string>
+getFunctionSourceCode(const FunctionDecl *FD, llvm::StringRef TargetNamespace) {
+  auto &SM = FD->getASTContext().getSourceManager();
+  auto TargetContext = findContextForNS(TargetNamespace, FD->getDeclContext());
+  if (!TargetContext)
+    return llvm::createStringError(
+        llvm::inconvertibleErrorCode(),
+        "define outline: couldn't find a context for target");
+
+  llvm::Error Errors = llvm::Error::success();
+  tooling::Replacements QualifierInsertions;
+
+  // Finds the first unqualified name in function return type and name, then
+  // qualifies those to be valid in TargetContext.
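+  // For instance (illustrative, mirroring the QualifyReturnValue and
+  // QualifyFunctionName tests in TweakTests.cpp): a definition
+  // `void foo() {}` inside namespace a::b, moved to the global namespace,
+  // is spelled `void a::b::foo() {}`, and a return type `Foo` declared in
+  // namespace a is spelled `a::Foo`. Parameters stay untouched.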
+  findExplicitReferences(FD, [&](ReferenceLoc Ref) {
+    // It is enough to qualify the first qualifier, so skip references with a
+    // qualifier. Also we can't do much if there are no targets or the name is
+    // inside a macro body.
+    if (Ref.Qualifier || Ref.Targets.empty() || Ref.NameLoc.isMacroID())
+      return;
+    // Only qualify return type and function name.
+    if (Ref.NameLoc != FD->getReturnTypeSourceRange().getBegin() &&
+        Ref.NameLoc != FD->getLocation())
+      return;
+
+    for (const NamedDecl *ND : Ref.Targets) {
+      if (ND->getDeclContext() != Ref.Targets.front()->getDeclContext()) {
+        elog("Targets from multiple contexts: {0}, {1}",
+             printQualifiedName(*Ref.Targets.front()), printQualifiedName(*ND));
+        return;
+      }
+    }
+    const NamedDecl *ND = Ref.Targets.front();
+    const std::string Qualifier =
+        getQualification(FD->getASTContext(), *TargetContext,
+                         SM.getLocForStartOfFile(SM.getMainFileID()), ND);
+    if (auto Err = QualifierInsertions.add(
+            tooling::Replacement(SM, Ref.NameLoc, 0, Qualifier)))
+      Errors = llvm::joinErrors(std::move(Errors), std::move(Err));
+  });
+
+  if (Errors)
+    return std::move(Errors);
+  return getFunctionSourceAfterReplacements(FD, QualifierInsertions);
+}
+
+struct InsertionPoint {
+  std::string EnclosingNamespace;
+  size_t Offset;
+};
+// Returns the most natural insertion point for \p QualifiedName in
+// \p Contents. This currently only considers namespace proximity, but in the
+// future it should also try to follow the ordering of declarations. For
+// example, if decls come in order `foo, bar, baz` then this function should
+// return some point between foo and baz for inserting bar.
+llvm::Expected<InsertionPoint>
+getInsertionPoint(llvm::StringRef Contents, llvm::StringRef QualifiedName,
+                  const format::FormatStyle &Style) {
+  auto Region = getEligiblePoints(Contents, QualifiedName, Style);
+
+  assert(!Region.EligiblePoints.empty());
+  // FIXME: This selection can be made smarter by looking at the definition
+  // locations for adjacent decls to Source. Unfortunately pseudo parsing in
+  // getEligiblePoints only knows about namespace begin/end events so we
+  // can't match function start/end positions yet.
+  auto Offset = positionToOffset(Contents, Region.EligiblePoints.back());
+  if (!Offset)
+    return Offset.takeError();
+  return InsertionPoint{Region.EnclosingNamespace, *Offset};
+}
+
+/// Moves definition of a function/method to an appropriate implementation
+/// file.
+///
+/// Before:
+///   a.h
+///     void foo() { return; }
+///   a.cc
+///     #include "a.h"
+///
+/// ----------------
+///
+/// After:
+///   a.h
+///     void foo();
+///   a.cc
+///     #include "a.h"
+///     void foo() { return; }
class DefineOutline : public Tweak {
+public:
+  const char *id() const override;
+
+  bool hidden() const override { return true; }
+  Intent intent() const override { return Intent::Refactor; }
+  std::string title() const override {
+    return "Move function body to out-of-line.";
+  }
+
+  bool prepare(const Selection &Sel) override {
+    // Bail out if we are not in a header file.
+    // FIXME: We might want to consider moving method definitions below class
+    // definition even if we are inside a source file.
+    if (!isHeaderFile(Sel.AST.getSourceManager().getFilename(Sel.Cursor),
+                      Sel.AST.getLangOpts()))
+      return false;
+
+    Source = getSelectedFunction(Sel.ASTSelection.commonAncestor());
+    // Bail out if the selection is not an inline function definition.
+    if (!Source || !Source->doesThisDeclarationHaveABody() ||
+        Source->isOutOfLine())
+      return false;
+
+    // Bail out in templated classes, as it is hard to spell the class name,
+    // e.g. if the template parameter is unnamed.
+    if (auto *MD = llvm::dyn_cast<CXXMethodDecl>(Source)) {
+      if (MD->getParent()->isTemplated())
+        return false;
+    }
+
+    // Note that we don't check whether an implementation file exists or not
+    // in prepare, since performing disk IO on each prepare request might be
+    // expensive.
+    return true;
+  }
+
+  Expected<Effect> apply(const Selection &Sel) override {
+    const SourceManager &SM = Sel.AST.getSourceManager();
+    auto MainFileName =
+        getCanonicalPath(SM.getFileEntryForID(SM.getMainFileID()), SM);
+    if (!MainFileName)
+      return llvm::createStringError(
+          llvm::inconvertibleErrorCode(),
+          "Couldn't get absolute path for main file.");
+
+    auto CCFile = getSourceFile(*MainFileName, Sel);
+    if (!CCFile)
+      return llvm::createStringError(
+          llvm::inconvertibleErrorCode(),
+          "Couldn't find a suitable implementation file.");
+
+    auto &FS =
+        Sel.AST.getSourceManager().getFileManager().getVirtualFileSystem();
+    auto Buffer = FS.getBufferForFile(*CCFile);
+    // FIXME: Maybe we should consider creating the implementation file if it
+    // doesn't exist?
+    if (!Buffer)
+      return llvm::createStringError(Buffer.getError(),
+                                     Buffer.getError().message());
+    auto Contents = Buffer->get()->getBuffer();
+    auto InsertionPoint =
+        getInsertionPoint(Contents, Source->getQualifiedNameAsString(),
+                          getFormatStyleForFile(*CCFile, Contents, &FS));
+    if (!InsertionPoint)
+      return InsertionPoint.takeError();
+
+    auto FuncDef =
+        getFunctionSourceCode(Source, InsertionPoint->EnclosingNamespace);
+    if (!FuncDef)
+      return FuncDef.takeError();
+
+    SourceManagerForFile SMFF(*CCFile, Contents);
+    const tooling::Replacement InsertFunctionDef(
+        *CCFile, InsertionPoint->Offset, 0, *FuncDef);
+    auto Effect = Effect::mainFileEdit(
+        SMFF.get(), tooling::Replacements(InsertFunctionDef));
+    if (!Effect)
+      return Effect.takeError();
+
+    // FIXME: We should also get rid of the inline qualifier.
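+    // Illustratively (per the ApplyTest expectations in TweakTests.cpp), the
+    // header-side edit built next turns `void foo() { return; }` into
+    // `void foo() ;`, while the main-file edit built above inserts the full
+    // definition into the implementation file.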
+    const tooling::Replacement DeleteFuncBody(
+        Sel.AST.getSourceManager(),
+        CharSourceRange::getTokenRange(*toHalfOpenFileRange(
+            SM, Sel.AST.getLangOpts(), Source->getBody()->getSourceRange())),
+        ";");
+    auto HeaderFE = Effect::fileEdit(SM, SM.getMainFileID(),
+                                     tooling::Replacements(DeleteFuncBody));
+    if (!HeaderFE)
+      return HeaderFE.takeError();
+
+    Effect->ApplyEdits.try_emplace(HeaderFE->first,
+                                   std::move(HeaderFE->second));
+    return std::move(*Effect);
+  }
+
+private:
+  const FunctionDecl *Source = nullptr;
+};
+
+REGISTER_TWEAK(DefineOutline);
+
+} // namespace
+} // namespace clangd
+} // namespace clang
diff --git a/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp b/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp
index 1551f41a13184..ce9addb293bf9 100644
--- a/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp
+++ b/clang-tools-extra/clangd/refactor/tweaks/ExtractFunction.cpp
@@ -645,7 +645,7 @@ tooling::Replacement createFunctionDefinition(const NewFunction &ExtractedFunc,
 bool ExtractFunction::prepare(const Selection &Inputs) {
   const Node *CommonAnc = Inputs.ASTSelection.commonAncestor();
   const SourceManager &SM = Inputs.AST.getSourceManager();
-  const LangOptions &LangOpts = Inputs.AST.getASTContext().getLangOpts();
+  const LangOptions &LangOpts = Inputs.AST.getLangOpts();
   if (auto MaybeExtZone = findExtractionZone(CommonAnc, SM, LangOpts)) {
     ExtZone = std::move(*MaybeExtZone);
     return true;
@@ -655,7 +655,7 @@ bool ExtractFunction::prepare(const Selection &Inputs) {
 
 Expected<Tweak::Effect> ExtractFunction::apply(const Selection &Inputs) {
   const SourceManager &SM = Inputs.AST.getSourceManager();
-  const LangOptions &LangOpts = Inputs.AST.getASTContext().getLangOpts();
+  const LangOptions &LangOpts = Inputs.AST.getLangOpts();
   auto ExtractedFunc = getExtractedFunction(ExtZone, SM, LangOpts);
   // FIXME: Add more types of errors.
   if (!ExtractedFunc)
diff --git a/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp b/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp
new file mode 100644
index 0000000000000..62d0c6a2d20c6
--- /dev/null
+++ b/clang-tools-extra/clangd/refactor/tweaks/ObjCLocalizeStringLiteral.cpp
@@ -0,0 +1,85 @@
+//===--- ObjCLocalizeStringLiteral.cpp ---------------------------*- C++-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Logger.h"
+#include "ParsedAST.h"
+#include "SourceCode.h"
+#include "refactor/Tweak.h"
+#include "clang/AST/ExprObjC.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Tooling/Core/Replacement.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+
+namespace clang {
+namespace clangd {
+namespace {
+
+/// Wraps an Objective-C string literal with the NSLocalizedString macro.
+/// Before: +/// @"description" +/// ^^^ +/// After: +/// NSLocalizedString(@"description", @"") +class ObjCLocalizeStringLiteral : public Tweak { +public: + const char *id() const override final; + Intent intent() const override { return Intent::Refactor; } + + bool prepare(const Selection &Inputs) override; + Expected apply(const Selection &Inputs) override; + std::string title() const override; + +private: + const clang::ObjCStringLiteral *Str = nullptr; +}; + +REGISTER_TWEAK(ObjCLocalizeStringLiteral) + +bool ObjCLocalizeStringLiteral::prepare(const Selection &Inputs) { + const SelectionTree::Node *N = Inputs.ASTSelection.commonAncestor(); + if (!N) + return false; + // Allow the refactoring even if the user selected only the C string part + // of the expression. + if (N->ASTNode.get()) { + if (N->Parent) + N = N->Parent; + } + Str = dyn_cast_or_null(N->ASTNode.get()); + return Str; +} + +Expected +ObjCLocalizeStringLiteral::apply(const Selection &Inputs) { + auto &SM = Inputs.AST.getSourceManager(); + auto &LangOpts = Inputs.AST.getASTContext().getLangOpts(); + auto Reps = tooling::Replacements(tooling::Replacement( + SM, CharSourceRange::getCharRange(Str->getBeginLoc()), + "NSLocalizedString(", LangOpts)); + SourceLocation EndLoc = Lexer::getLocForEndOfToken( + Str->getEndLoc(), 0, Inputs.AST.getSourceManager(), LangOpts); + if (auto Err = Reps.add(tooling::Replacement( + SM, CharSourceRange::getCharRange(EndLoc), ", @\"\")", LangOpts))) + return std::move(Err); + return Effect::mainFileEdit(SM, std::move(Reps)); +} + +std::string ObjCLocalizeStringLiteral::title() const { + return "Wrap in NSLocalizedString"; +} + +} // namespace +} // namespace clangd +} // namespace clang diff --git a/clang-tools-extra/clangd/refactor/tweaks/RawStringLiteral.cpp b/clang-tools-extra/clangd/refactor/tweaks/RawStringLiteral.cpp index 42d0122b33824..2d4bf755f64f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/RawStringLiteral.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/RawStringLiteral.cpp @@ -91,7 +91,7 @@ Expected RawStringLiteral::apply(const Selection &Inputs) { auto &SM = Inputs.AST.getSourceManager(); auto Reps = tooling::Replacements( tooling::Replacement(SM, Str, ("R\"(" + Str->getBytes() + ")\"").str(), - Inputs.AST.getASTContext().getLangOpts())); + Inputs.AST.getLangOpts())); return Effect::mainFileEdit(SM, std::move(Reps)); } diff --git a/clang-tools-extra/clangd/test/exit-eof.test b/clang-tools-extra/clangd/test/exit-eof.test new file mode 100644 index 0000000000000..06d2ea87ff480 --- /dev/null +++ b/clang-tools-extra/clangd/test/exit-eof.test @@ -0,0 +1,7 @@ +# RUN: not clangd -sync < %s 2> %t.err +# RUN: FileCheck %s < %t.err +# +# No LSP messages here, just let clangd see the end-of-file +# CHECK: Transport error: +# (Typically "Transport error: Input/output error" but platform-dependent). 
+ diff --git a/clang-tools-extra/clangd/tool/ClangdMain.cpp b/clang-tools-extra/clangd/tool/ClangdMain.cpp index 2639df31dbe8d..b8385a0c9e5d5 100644 --- a/clang-tools-extra/clangd/tool/ClangdMain.cpp +++ b/clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -11,6 +11,7 @@ #include "Features.inc" #include "Path.h" #include "Protocol.h" +#include "Shutdown.h" #include "Trace.h" #include "Transport.h" #include "index/Background.h" @@ -35,6 +36,10 @@ #include #include +#ifndef _WIN32 +#include +#endif + namespace clang { namespace clangd { namespace { @@ -264,6 +269,16 @@ list TweakList{ CommaSeparated, }; +opt CrossFileRename{ + "cross-file-rename", + cat(Features), + desc("Enable cross-file rename feature. Note that this feature is " + "experimental and may lead to broken code or incomplete rename " + "results"), + init(false), + Hidden, +}; + opt WorkerThreadsCount{ "j", cat(Misc), @@ -435,6 +450,7 @@ int main(int argc, char *argv[]) { llvm::InitializeAllTargetInfos(); llvm::sys::PrintStackTraceOnErrorSignal(argv[0]); + llvm::sys::SetInterruptFunction(&requestShutdown); llvm::cl::SetVersionPrinter([](llvm::raw_ostream &OS) { OS << clang::getClangToolFullVersion("clangd") << "\n"; }); @@ -531,6 +547,10 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var LoggingSession LoggingSession(Logger); // Write some initial logs before we start doing any real work. log("{0}", clang::getClangToolFullVersion("clangd")); +// FIXME: abstract this better, and print PID on windows too. +#ifndef _WIN32 + log("PID: {0}", getpid()); +#endif { SmallString<128> CWD; if (auto Err = llvm::sys::fs::current_path(CWD)) @@ -595,6 +615,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var } Opts.StaticIndex = StaticIdx.get(); Opts.AsyncThreadsCount = WorkerThreadsCount; + Opts.CrossFileRename = CrossFileRename; clangd::CodeCompleteOptions CCOpts; CCOpts.IncludeIneligibleResults = IncludeIneligibleResults; @@ -683,12 +704,7 @@ clangd accepts flags on the commandline, and in the CLANGD_FLAGS environment var // However if a bug causes them to run forever, we want to ensure the process // eventually exits. As clangd isn't directly user-facing, an editor can // "leak" clangd processes. Crashing in this case contains the damage. - // - // This is more portable than sys::WatchDog, and yields a stack trace. 
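The removed inline watchdog continuing below is folded into an `abortAfterTimeout` helper from the newly added Shutdown.h include. That helper's implementation is not part of this diff; a minimal sketch of what it plausibly wraps, mirroring the removed code (name taken from the call site, exact signature assumed), is:

    #include <chrono>
    #include <cstdlib>
    #include <thread>

    // Plausible shape only; the real helper lives in clangd's Shutdown files.
    // The thread is detached so it never blocks a normal, timely exit; if the
    // process is still alive when the timer fires, crash to contain the damage.
    void abortAfterTimeout(std::chrono::seconds Timeout) {
      std::thread([Timeout] {
        std::this_thread::sleep_for(Timeout);
        std::abort();
      }).detach();
    }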
- std::thread([] { - std::this_thread::sleep_for(std::chrono::minutes(5)); - std::abort(); - }).detach(); + abortAfterTimeout(std::chrono::minutes(5)); return ExitCode; } diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index cb6d611503199..28f18e73d7a85 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -1874,7 +1874,10 @@ TEST(CompletionTest, CompletionTokenRange) { Annotations TestCode(Text); auto Results = completions(Server, TestCode.code(), TestCode.point()); - EXPECT_EQ(Results.Completions.size(), 1u); + if (Results.Completions.size() != 1) { + ADD_FAILURE() << "Results.Completions.size() != 1"; + continue; + } EXPECT_THAT(Results.Completions.front().CompletionTokenRange, TestCode.range()); } diff --git a/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp b/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp index d4438e0a9a0b0..8eee7550bf8e4 100644 --- a/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp +++ b/clang-tools-extra/clangd/unittests/CollectMacrosTests.cpp @@ -88,7 +88,7 @@ TEST(CollectMainFileMacros, SelectedMacros) { break; auto Loc = getBeginningOfIdentifier(ExpectedRefs.begin()->start, SM, - AST.getASTContext().getLangOpts()); + AST.getLangOpts()); auto Macro = locateMacroAt(Loc, PP); assert(Macro); auto SID = getSymbolID(Macro->Name, Macro->Info, SM); diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp index fe7a8898c5de4..3c0257849021d 100644 --- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp +++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp @@ -709,7 +709,10 @@ void bar(X *x) { auto Parsed = TU.build(); for (const auto &D : Parsed.getDiagnostics()) { - EXPECT_EQ(D.Fixes.size(), 1u); + if (D.Fixes.size() != 1) { + ADD_FAILURE() << "D.Fixes.size() != 1"; + continue; + } EXPECT_EQ(D.Fixes[0].Message, std::string("Add include \"a.h\" for symbol X")); } diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index f6e5fe723ec71..620eb3d6d3d69 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -114,6 +114,23 @@ TEST_F(TargetDeclTest, Exprs) { auto X = S() [[+]] S(); )cpp"; EXPECT_DECLS("DeclRefExpr", "S operator+(S) const"); + + Code = R"cpp( + int foo(); + int s = foo[[()]]; + )cpp"; + EXPECT_DECLS("CallExpr", "int foo()"); + + Code = R"cpp( + struct X { + void operator()(int n); + }; + void test() { + X x; + x[[(123)]]; + } + )cpp"; + EXPECT_DECLS("CXXOperatorCallExpr", "void operator()(int n)"); } TEST_F(TargetDeclTest, UsingDecl) { diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp index 8dedcf579fd33..8a54b552258c1 100644 --- a/clang-tools-extra/clangd/unittests/RenameTests.cpp +++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp @@ -7,10 +7,14 @@ //===----------------------------------------------------------------------===// #include "Annotations.h" +#include "ClangdServer.h" +#include "SyncAPI.h" #include "TestFS.h" #include "TestTU.h" +#include "index/Ref.h" #include "refactor/Rename.h" #include "clang/Tooling/Core/Replacement.h" +#include "llvm/Support/MemoryBuffer.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -18,8 +22,45 @@ namespace clang { namespace 
clangd {
 namespace {
 
-MATCHER_P2(RenameRange, Code, Range, "") {
-  return replacementToEdit(Code, arg).range == Range;
+using testing::Eq;
+using testing::Pair;
+using testing::UnorderedElementsAre;
+
+// Build a RefSlab from all marked ranges in the annotation. The ranges are
+// assumed to associate with the given SymbolName.
+std::unique_ptr<RefSlab> buildRefSlab(const Annotations &Code,
+                                      llvm::StringRef SymbolName,
+                                      llvm::StringRef Path) {
+  RefSlab::Builder Builder;
+  TestTU TU;
+  TU.HeaderCode = Code.code();
+  auto Symbols = TU.headerSymbols();
+  const auto &SymbolID = findSymbol(Symbols, SymbolName).ID;
+  for (const auto &Range : Code.ranges()) {
+    Ref R;
+    R.Kind = RefKind::Reference;
+    R.Location.Start.setLine(Range.start.line);
+    R.Location.Start.setColumn(Range.start.character);
+    R.Location.End.setLine(Range.end.line);
+    R.Location.End.setColumn(Range.end.character);
+    auto U = URI::create(Path).toString();
+    R.Location.FileURI = U.c_str();
+    Builder.insert(SymbolID, R);
+  }
+
+  return std::make_unique<RefSlab>(std::move(Builder).build());
+}
+
+std::vector<
+    std::pair</*FilePath*/ std::string, /*CodeAfterRename*/ std::string>>
+applyEdits(FileEdits FE) {
+  std::vector<std::pair<std::string, std::string>> Results;
+  for (auto &It : FE)
+    Results.emplace_back(
+        It.first().str(),
+        llvm::cantFail(tooling::applyAllReplacements(
+            It.getValue().InitialCode, It.getValue().Replacements)));
+  return Results;
 }
 
 // Generates an expected rename result by replacing all ranges in the given
@@ -363,11 +404,11 @@ TEST(RenameTest, WithinFileRename) {
     llvm::StringRef NewName = "abcde";
     for (const auto &RenamePos : Code.points()) {
       auto RenameResult =
-          renameWithinFile(AST, testPath(TU.Filename), RenamePos, NewName);
-      ASSERT_TRUE(bool(RenameResult)) << RenameResult.takeError() << T;
-      auto ApplyResult = llvm::cantFail(
-          tooling::applyAllReplacements(Code.code(), *RenameResult));
-      EXPECT_EQ(expectedResult(Code, NewName), ApplyResult);
+          rename({RenamePos, NewName, AST, testPath(TU.Filename)});
+      ASSERT_TRUE(bool(RenameResult)) << RenameResult.takeError();
+      ASSERT_EQ(1u, RenameResult->size());
+      EXPECT_EQ(applyEdits(std::move(*RenameResult)).front().second,
+                expectedResult(Code, NewName));
     }
   }
 }
@@ -411,13 +452,20 @@ TEST(RenameTest, Renameable) {
        )cpp",
        "used outside main file", HeaderFile, Index},
-      {R"cpp(// disallow -- symbol is not indexable.
+      {R"cpp(// disallow -- symbol in anonymous namespace in header is not indexable.
         namespace {
         class Unin^dexable {};
         }
       )cpp",
        "not eligible for indexing", HeaderFile, Index},
+      {R"cpp(// allow -- symbol in anonymous namespace in non-header file is indexable.
+        namespace {
+        class [[F^oo]] {};
+        }
+      )cpp",
+       nullptr, !HeaderFile, Index},
+
       {R"cpp(// disallow -- namespace symbol isn't supported
        namespace n^s {}
      )cpp",
       "not eligible for indexing", HeaderFile, Index},
 
@@ -480,23 +528,23 @@ TEST(RenameTest, Renameable) {
     }
     auto AST = TU.build();
     llvm::StringRef NewName = "dummyNewName";
-    auto Results = renameWithinFile(AST, testPath(TU.Filename), T.point(),
-                                    NewName, Case.Index);
+    auto Results =
+        rename({T.point(), NewName, AST, testPath(TU.Filename), Case.Index});
     bool WantRename = true;
     if (T.ranges().empty())
       WantRename = false;
     if (!WantRename) {
       assert(Case.ErrorMessage && "Error message must be set!");
       EXPECT_FALSE(Results)
-          << "expected renameWithinFile returned an error: " << T.code();
+          << "expected rename returned an error: " << T.code();
       auto ActualMessage = llvm::toString(Results.takeError());
       EXPECT_THAT(ActualMessage, testing::HasSubstr(Case.ErrorMessage));
     } else {
-      EXPECT_TRUE(bool(Results)) << "renameWithinFile returned an error: "
+      EXPECT_TRUE(bool(Results)) << "rename returned an error: "
                                  << llvm::toString(Results.takeError());
-      auto ApplyResult =
-          llvm::cantFail(tooling::applyAllReplacements(T.code(), *Results));
-      EXPECT_EQ(expectedResult(T, NewName), ApplyResult);
+      ASSERT_EQ(1u, Results->size());
+      EXPECT_EQ(applyEdits(std::move(*Results)).front().second,
+                expectedResult(T, NewName));
     }
   }
 }
@@ -522,11 +570,287 @@ TEST(RenameTest, MainFileReferencesOnly) {
   llvm::StringRef NewName = "abcde";
   auto RenameResult =
-      renameWithinFile(AST, testPath(TU.Filename), Code.point(), NewName);
+      rename({Code.point(), NewName, AST, testPath(TU.Filename)});
   ASSERT_TRUE(bool(RenameResult)) << RenameResult.takeError() << Code.point();
-  auto ApplyResult =
-      llvm::cantFail(tooling::applyAllReplacements(Code.code(), *RenameResult));
-  EXPECT_EQ(expectedResult(Code, NewName), ApplyResult);
+  ASSERT_EQ(1u, RenameResult->size());
+  EXPECT_EQ(applyEdits(std::move(*RenameResult)).front().second,
+            expectedResult(Code, NewName));
+}
+
+TEST(CrossFileRenameTests, DirtyBuffer) {
+  Annotations FooCode("class [[Foo]] {};");
+  std::string FooPath = testPath("foo.cc");
+  Annotations FooDirtyBuffer("class [[Foo]] {};\n// this is dirty buffer");
+  Annotations BarCode("void [[Bar]]() {}");
+  std::string BarPath = testPath("bar.cc");
+  // Build the index; it has "Foo" references from foo.cc and "Bar"
+  // references from bar.cc.
+  FileSymbols FSymbols;
+  FSymbols.update(FooPath, nullptr, buildRefSlab(FooCode, "Foo", FooPath),
+                  nullptr, false);
+  FSymbols.update(BarPath, nullptr, buildRefSlab(BarCode, "Bar", BarPath),
+                  nullptr, false);
+  auto Index = FSymbols.buildIndex(IndexType::Light);
+
+  Annotations MainCode("class [[Fo^o]] {};");
+  auto MainFilePath = testPath("main.cc");
+  // Dirty buffer for foo.cc.
+  auto GetDirtyBuffer = [&](PathRef Path) -> llvm::Optional<std::string> {
+    if (Path == FooPath)
+      return FooDirtyBuffer.code().str();
+    return llvm::None;
+  };
+
+  // Run rename on Foo; there is a dirty buffer for foo.cc, and rename should
+  // respect it.
+  TestTU TU = TestTU::withCode(MainCode.code());
+  auto AST = TU.build();
+  llvm::StringRef NewName = "newName";
+  auto Results = rename({MainCode.point(), NewName, AST, MainFilePath,
+                         Index.get(), /*CrossFile=*/true, GetDirtyBuffer});
+  ASSERT_TRUE(bool(Results)) << Results.takeError();
+  EXPECT_THAT(
+      applyEdits(std::move(*Results)),
+      UnorderedElementsAre(
+          Pair(Eq(FooPath), Eq(expectedResult(FooDirtyBuffer, NewName))),
+          Pair(Eq(MainFilePath), Eq(expectedResult(MainCode, NewName)))));
+
+  // Run rename on Bar; there is no dirty buffer for the affected file bar.cc,
+  // so we should read the file content from the VFS.
+  MainCode = Annotations("void [[Bar]]() { [[B^ar]](); }");
+  TU = TestTU::withCode(MainCode.code());
+  // Set a file "bar.cc" on disk.
+  TU.AdditionalFiles["bar.cc"] = BarCode.code();
+  AST = TU.build();
+  Results = rename({MainCode.point(), NewName, AST, MainFilePath, Index.get(),
+                    /*CrossFile=*/true, GetDirtyBuffer});
+  ASSERT_TRUE(bool(Results)) << Results.takeError();
+  EXPECT_THAT(
+      applyEdits(std::move(*Results)),
+      UnorderedElementsAre(
+          Pair(Eq(BarPath), Eq(expectedResult(BarCode, NewName))),
+          Pair(Eq(MainFilePath), Eq(expectedResult(MainCode, NewName)))));
+
+  // Run rename against a paginated index which couldn't return all refs in
+  // one request; we reject rename in this case.
+  class PaginationIndex : public SymbolIndex {
+    bool refs(const RefsRequest &Req,
+              llvm::function_ref<void(const Ref &)> Callback) const override {
+      return true; // has more references
+    }
+
+    bool fuzzyFind(
+        const FuzzyFindRequest &Req,
+        llvm::function_ref<void(const Symbol &)> Callback) const override {
+      return false;
+    }
+    void
+    lookup(const LookupRequest &Req,
+           llvm::function_ref<void(const Symbol &)> Callback) const override {}
+
+    void relations(const RelationsRequest &Req,
+                   llvm::function_ref<void(const SymbolID &, const Symbol &)>
+                       Callback) const override {}
+    size_t estimateMemoryUsage() const override { return 0; }
+  } PIndex;
+  Results = rename({MainCode.point(), NewName, AST, MainFilePath, &PIndex,
+                    /*CrossFile=*/true, GetDirtyBuffer});
+  EXPECT_FALSE(Results);
+  EXPECT_THAT(llvm::toString(Results.takeError()),
+              testing::HasSubstr("too many occurrences"));
+}
+
+TEST(CrossFileRenameTests, WithUpToDateIndex) {
+  MockCompilationDatabase CDB;
+  CDB.ExtraClangFlags = {"-xc++"};
+  class IgnoreDiagnostics : public DiagnosticsConsumer {
+    void onDiagnosticsReady(PathRef File,
+                            std::vector<Diag> Diagnostics) override {}
+  } DiagConsumer;
+  // rename is running on the "^" point in FooH, and "[[]]" ranges are the
+  // expected rename occurrences.
+  struct Case {
+    llvm::StringRef FooH;
+    llvm::StringRef FooCC;
+  } Cases[] = {
+      {
+          // classes.
+          R"cpp(
+        class [[Fo^o]] {
+          [[Foo]]();
+          ~[[Foo]]();
+        };
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        [[Foo]]::[[Foo]]() {}
+        [[Foo]]::~[[Foo]]() {}
+
+        void func() {
+          [[Foo]] foo;
+        }
+      )cpp",
+      },
+      {
+          // class methods.
+          R"cpp(
+        class Foo {
+          void [[f^oo]]();
+        };
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        void Foo::[[foo]]() {}
+
+        void func(Foo* p) {
+          p->[[foo]]();
+        }
+      )cpp",
+      },
+      {
+          // functions.
+          R"cpp(
+        void [[f^oo]]();
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        void [[foo]]() {}
+
+        void func() {
+          [[foo]]();
+        }
+      )cpp",
+      },
+      {
+          // typedefs.
+          R"cpp(
+        typedef int [[IN^T]];
+        [[INT]] foo();
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        [[INT]] foo() {}
+      )cpp",
+      },
+      {
+          // usings.
+          R"cpp(
+        using [[I^NT]] = int;
+        [[INT]] foo();
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        [[INT]] foo() {}
+      )cpp",
+      },
+      {
+          // variables.
+          R"cpp(
+        static const int [[VA^R]] = 123;
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        int s = [[VAR]];
+      )cpp",
+      },
+      {
+          // scope enums.
+          R"cpp(
+        enum class [[K^ind]] { ABC };
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        [[Kind]] ff() {
+          return [[Kind]]::ABC;
+        }
+      )cpp",
+      },
+      {
+          // enum constants.
+          R"cpp(
+        enum class Kind { [[A^BC]] };
+      )cpp",
+          R"cpp(
+        #include "foo.h"
+        Kind ff() {
+          return Kind::[[ABC]];
+        }
+      )cpp",
+      },
+  };
+
+  for (const auto &T : Cases) {
+    Annotations FooH(T.FooH);
+    Annotations FooCC(T.FooCC);
+    std::string FooHPath = testPath("foo.h");
+    std::string FooCCPath = testPath("foo.cc");
+
+    MockFSProvider FS;
+    FS.Files[FooHPath] = FooH.code();
+    FS.Files[FooCCPath] = FooCC.code();
+
+    auto ServerOpts = ClangdServer::optsForTest();
+    ServerOpts.CrossFileRename = true;
+    ServerOpts.BuildDynamicSymbolIndex = true;
+    ClangdServer Server(CDB, FS, DiagConsumer, ServerOpts);
+
+    // Add all files to the clangd server to make sure the dynamic index has
+    // been built.
+    runAddDocument(Server, FooHPath, FooH.code());
+    runAddDocument(Server, FooCCPath, FooCC.code());
+
+    llvm::StringRef NewName = "NewName";
+    auto FileEditsList =
+        llvm::cantFail(runRename(Server, FooHPath, FooH.point(), NewName));
+    EXPECT_THAT(applyEdits(std::move(FileEditsList)),
+                UnorderedElementsAre(
+                    Pair(Eq(FooHPath), Eq(expectedResult(T.FooH, NewName))),
+                    Pair(Eq(FooCCPath), Eq(expectedResult(T.FooCC, NewName)))));
+  }
+}
+
+TEST(CrossFileRenameTests, CrossFileOnLocalSymbol) {
+  // Cross-file rename should work for function-local symbols, even when there
+  // is no index provided.
+  Annotations Code("void f(int [[abc]]) { [[a^bc]] = 3; }");
+  auto TU = TestTU::withCode(Code.code());
+  auto Path = testPath(TU.Filename);
+  auto AST = TU.build();
+  llvm::StringRef NewName = "newName";
+  auto Results = rename({Code.point(), NewName, AST, Path});
+  ASSERT_TRUE(bool(Results)) << Results.takeError();
+  EXPECT_THAT(
+      applyEdits(std::move(*Results)),
+      UnorderedElementsAre(Pair(Eq(Path), Eq(expectedResult(Code, NewName)))));
+}
+
+TEST(CrossFileRenameTests, BuildRenameEdits) {
+  Annotations Code("[[😂]]");
+  auto LSPRange = Code.range();
+  llvm::StringRef FilePath = "/test/TestTU.cpp";
+  auto Edit = buildRenameEdit(FilePath, Code.code(), {LSPRange}, "abc");
+  ASSERT_TRUE(bool(Edit)) << Edit.takeError();
+  ASSERT_EQ(1UL, Edit->Replacements.size());
+  EXPECT_EQ(FilePath, Edit->Replacements.begin()->getFilePath());
+  EXPECT_EQ(4UL, Edit->Replacements.begin()->getLength());
+
+  // Test invalid range.
+  LSPRange.end = {10, 0}; // out of range
+  Edit = buildRenameEdit(FilePath, Code.code(), {LSPRange}, "abc");
+  EXPECT_FALSE(Edit);
+  EXPECT_THAT(llvm::toString(Edit.takeError()),
+              testing::HasSubstr("fail to convert"));
+
+  // Normal ASCII characters.
+ Annotations T(R"cpp( + [[range]] + [[range]] + [[range]] + )cpp"); + Edit = buildRenameEdit(FilePath, T.code(), T.ranges(), "abc"); + ASSERT_TRUE(bool(Edit)) << Edit.takeError(); + EXPECT_EQ(applyEdits(FileEdits{{T.code(), std::move(*Edit)}}).front().second, + expectedResult(Code, expectedResult(T, "abc"))); } } // namespace diff --git a/clang-tools-extra/clangd/unittests/SelectionTests.cpp b/clang-tools-extra/clangd/unittests/SelectionTests.cpp index 2803aaaca1c57..9e1a90b55e3ac 100644 --- a/clang-tools-extra/clangd/unittests/SelectionTests.cpp +++ b/clang-tools-extra/clangd/unittests/SelectionTests.cpp @@ -40,7 +40,7 @@ Range nodeRange(const SelectionTree::Node *N, ParsedAST &AST) { if (!N) return Range{}; const SourceManager &SM = AST.getSourceManager(); - const LangOptions &LangOpts = AST.getASTContext().getLangOpts(); + const LangOptions &LangOpts = AST.getLangOpts(); StringRef Buffer = SM.getBufferData(SM.getMainFileID()); if (llvm::isa_and_nonnull(N->ASTNode.get())) return Range{Position{}, offsetToPosition(Buffer, Buffer.size())}; @@ -134,6 +134,15 @@ TEST(SelectionTest, CommonAncestor) { )cpp", "IfStmt", }, + { + R"cpp( + int x(int); + #define M(foo) x(foo) + int a = 42; + int b = M([[^a]]); + )cpp", + "DeclRefExpr", + }, { R"cpp( void foo(); @@ -234,6 +243,7 @@ TEST(SelectionTest, CommonAncestor) { {"void foo() { [[foo^()]]; }", "CallExpr"}, {"void foo() { [[foo^]] (); }", "DeclRefExpr"}, {"int bar; void foo() [[{ foo (); }]]^", "CompoundStmt"}, + {"int x = [[42]]^;", "IntegerLiteral"}, // Ignores whitespace, comments, and semicolons in the selection. {"void foo() { [[foo^()]]; /*comment*/^}", "CallExpr"}, @@ -271,7 +281,6 @@ TEST(SelectionTest, CommonAncestor) { // FIXME: Ideally we'd get a declstmt or the VarDecl itself here. // This doesn't happen now; the RAV doesn't traverse a node containing ;. {"int x = 42;^", nullptr}, - {"int x = 42^;", nullptr}, // Common ancestor is logically TUDecl, but we never return that. {"^int x; int y;^", nullptr}, @@ -378,6 +387,7 @@ TEST(SelectionTest, Selected) { $C[[return]]; }]] else [[{^ }]]]] + char z; } )cpp", R"cpp( @@ -386,10 +396,10 @@ TEST(SelectionTest, Selected) { void foo(^$C[[unique_ptr<$C[[unique_ptr<$C[[int]]>]]>]]^ a) {} )cpp", R"cpp(int a = [[5 >^> 1]];)cpp", - R"cpp([[ + R"cpp( #define ECHO(X) X - ECHO(EC^HO([[$C[[int]]) EC^HO(a]])); - ]])cpp", + ECHO(EC^HO($C[[int]]) EC^HO(a)); + )cpp", R"cpp( $C[[^$C[[int]] a^]]; )cpp", R"cpp( $C[[^$C[[int]] a = $C[[5]]^]]; )cpp", }; @@ -428,6 +438,56 @@ TEST(SelectionTest, PathologicalPreprocessor) { EXPECT_EQ("WhileStmt", T.commonAncestor()->Parent->kind()); } +TEST(SelectionTest, IncludedFile) { + const char *Case = R"cpp( + void test() { +#include "Exp^and.inc" + break; + } + )cpp"; + Annotations Test(Case); + auto TU = TestTU::withCode(Test.code()); + TU.AdditionalFiles["Expand.inc"] = "while(1)\n"; + auto AST = TU.build(); + auto T = makeSelectionTree(Case, AST); + + EXPECT_EQ("WhileStmt", T.commonAncestor()->kind()); +} + +TEST(SelectionTest, MacroArgExpansion) { + // If a macro arg is expanded several times, we consider them all selected. + const char *Case = R"cpp( + int mul(int, int); + #define SQUARE(X) mul(X, X); + int nine = SQUARE(^3); + )cpp"; + Annotations Test(Case); + auto AST = TestTU::withCode(Test.code()).build(); + auto T = makeSelectionTree(Case, AST); + // Unfortunately, this makes the common ancestor the CallExpr... + // FIXME: hack around this by picking one? 
+ EXPECT_EQ("CallExpr", T.commonAncestor()->kind()); + EXPECT_FALSE(T.commonAncestor()->Selected); + EXPECT_EQ(2u, T.commonAncestor()->Children.size()); + for (const auto* N : T.commonAncestor()->Children) { + EXPECT_EQ("IntegerLiteral", N->kind()); + EXPECT_TRUE(N->Selected); + } + + // Verify that the common assert() macro doesn't suffer from this. + // (This is because we don't associate the stringified token with the arg). + Case = R"cpp( + void die(const char*); + #define assert(x) (x ? (void)0 : die(#x) + void foo() { assert(^42); } + )cpp"; + Test = Annotations(Case); + AST = TestTU::withCode(Test.code()).build(); + T = makeSelectionTree(Case, AST); + + EXPECT_EQ("IntegerLiteral", T.commonAncestor()->kind()); +} + TEST(SelectionTest, Implicit) { const char* Test = R"cpp( struct S { S(const char*); }; diff --git a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp index b9ca0273a8233..f518fea672920 100644 --- a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp +++ b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp @@ -88,11 +88,8 @@ TEST(SemanticSelection, All) { R"cpp( // Single statement in TU. [[int v = [[1^00]]]]; )cpp", - // FIXME: No node found associated to the position. R"cpp( // Cursor at end of VarDecl. - void func() { - int v = 100 + 100^; - } + [[int v = [[100]]^]]; )cpp", // FIXME: No node found associated to the position. R"cpp( // Cursor in between spaces. diff --git a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp index 0dabce2a3d64d..5979261600bbf 100644 --- a/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp +++ b/clang-tools-extra/clangd/unittests/SourceCodeTests.cpp @@ -358,7 +358,7 @@ Bar* bar; auto AST = TestTU::withCode(TestCase.code()).build(); const auto &SourceMgr = AST.getSourceManager(); SourceLocation Actual = getBeginningOfIdentifier( - TestCase.points().back(), SourceMgr, AST.getASTContext().getLangOpts()); + TestCase.points().back(), SourceMgr, AST.getLangOpts()); Position ActualPos = offsetToPosition( TestCase.code(), SourceMgr.getFileOffset(SourceMgr.getSpellingLoc(Actual))); @@ -482,7 +482,7 @@ TEST(SourceCodeTests, GetMacros) { TestTU TU = TestTU::withCode(Code.code()); auto AST = TU.build(); auto Loc = getBeginningOfIdentifier(Code.point(), AST.getSourceManager(), - AST.getASTContext().getLangOpts()); + AST.getLangOpts()); auto Result = locateMacroAt(Loc, AST.getPreprocessor()); ASSERT_TRUE(Result); EXPECT_THAT(*Result, MacroName("MACRO")); @@ -548,7 +548,7 @@ TEST(SourceCodeTests, HalfOpenFileRange) { ParsedAST AST = TestTU::withCode(Test.code()).build(); llvm::errs() << Test.code(); const SourceManager &SM = AST.getSourceManager(); - const LangOptions &LangOpts = AST.getASTContext().getLangOpts(); + const LangOptions &LangOpts = AST.getLangOpts(); // Turn a SourceLocation into a pair of positions auto SourceRangeToRange = [&SM](SourceRange SrcRange) { return Range{sourceLocToPosition(SM, SrcRange.getBegin()), @@ -588,8 +588,7 @@ TEST(SourceCodeTests, HalfOpenFileRangePathologicalPreprocessor) { const auto &Body = cast(Func.getBody()); const auto &Loop = cast(*Body->child_begin()); llvm::Optional Range = toHalfOpenFileRange( - AST.getSourceManager(), AST.getASTContext().getLangOpts(), - Loop->getSourceRange()); + AST.getSourceManager(), AST.getLangOpts(), Loop->getSourceRange()); ASSERT_TRUE(Range) << "Failed to get file range"; 
EXPECT_EQ(AST.getSourceManager().getFileOffset(Range->getBegin()), Test.llvm::Annotations::range().Begin); diff --git a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp index d737862fa0465..abc7aa389bd54 100644 --- a/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp +++ b/clang-tools-extra/clangd/unittests/SymbolCollectorTests.cpp @@ -39,6 +39,7 @@ using ::testing::Contains; using ::testing::Each; using ::testing::ElementsAre; using ::testing::Field; +using ::testing::IsEmpty; using ::testing::Not; using ::testing::Pair; using ::testing::UnorderedElementsAre; @@ -214,7 +215,8 @@ class SymbolIndexActionFactory : public tooling::FrontendActionFactory { CreateASTConsumer(CompilerInstance &CI, llvm::StringRef InFile) override { if (PragmaHandler) CI.getPreprocessor().addCommentHandler(PragmaHandler); - return createIndexingASTConsumer(DataConsumer, Opts, CI.getPreprocessorPtr()); + return createIndexingASTConsumer(DataConsumer, Opts, + CI.getPreprocessorPtr()); } bool BeginInvocation(CompilerInstance &CI) override { @@ -577,15 +579,16 @@ o]](); TEST_F(SymbolCollectorTest, Refs) { Annotations Header(R"( - class $foo[[Foo]] { + #define MACRO(X) (X + 1) + class Foo { public: - $foo[[Foo]]() {} - $foo[[Foo]](int); + Foo() {} + Foo(int); }; - class $bar[[Bar]]; - void $func[[func]](); + class Bar; + void func(); - namespace $ns[[NS]] {} // namespace ref is ignored + namespace NS {} // namespace ref is ignored )"); Annotations Main(R"( class $bar[[Bar]] {}; @@ -598,19 +601,20 @@ TEST_F(SymbolCollectorTest, Refs) { $func[[func]](); int abc = 0; $foo[[Foo]] foo2 = abc; + abc = $macro[[MACRO]](1); } )"); Annotations SymbolsOnlyInMainCode(R"( + #define FUNC(X) (X+1) int a; void b() {} - static const int c = 0; + static const int c = FUNC(1); class d {}; )"); CollectorOpts.RefFilter = RefKind::All; + CollectorOpts.CollectMacro = true; runSymbolCollector(Header.code(), (Main.code() + SymbolsOnlyInMainCode.code()).str()); - auto HeaderSymbols = TestTU::withHeaderCode(Header.code()).headerSymbols(); - EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "Foo").ID, HaveRanges(Main.ranges("foo"))))); EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "Bar").ID, @@ -618,12 +622,82 @@ TEST_F(SymbolCollectorTest, Refs) { EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "func").ID, HaveRanges(Main.ranges("func"))))); EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(Symbols, "NS").ID, _)))); - // Symbols *only* in the main file (a, b, c) had no refs collected. + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "MACRO").ID, + HaveRanges(Main.ranges("macro"))))); + // Symbols *only* in the main file (a, b, c, FUNC) had no refs collected. auto MainSymbols = TestTU::withHeaderCode(SymbolsOnlyInMainCode.code()).headerSymbols(); EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "a").ID, _)))); EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "b").ID, _)))); EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "c").ID, _)))); + EXPECT_THAT(Refs, Not(Contains(Pair(findSymbol(MainSymbols, "FUNC").ID, _)))); +} + +TEST_F(SymbolCollectorTest, MacroRefInHeader) { + Annotations Header(R"( + #define $foo[[FOO]](X) (X + 1) + #define $bar[[BAR]](X) (X + 2) + + // Macro defined multiple times. + #define $ud1[[UD]] 1 + int ud_1 = $ud1[[UD]]; + #undef UD + + #define $ud2[[UD]] 2 + int ud_2 = $ud2[[UD]]; + #undef UD + + // Macros from token concatenations not included. 
+ #define $concat[[CONCAT]](X) X##A() + #define $prepend[[PREPEND]](X) MACRO##X() + #define $macroa[[MACROA]]() 123 + int B = $concat[[CONCAT]](MACRO); + int D = $prepend[[PREPEND]](A); + + void fff() { + int abc = $foo[[FOO]](1) + $bar[[BAR]]($foo[[FOO]](1)); + } + )"); + CollectorOpts.RefFilter = RefKind::All; + CollectorOpts.RefsInHeaders = true; + // Need this to get the SymbolID for macros for tests. + CollectorOpts.CollectMacro = true; + + runSymbolCollector(Header.code(), ""); + + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "FOO").ID, + HaveRanges(Header.ranges("foo"))))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "BAR").ID, + HaveRanges(Header.ranges("bar"))))); + // No unique ID for multiple symbols named UD. Check for ranges only. + EXPECT_THAT(Refs, Contains(Pair(_, HaveRanges(Header.ranges("ud1"))))); + EXPECT_THAT(Refs, Contains(Pair(_, HaveRanges(Header.ranges("ud2"))))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "CONCAT").ID, + HaveRanges(Header.ranges("concat"))))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "PREPEND").ID, + HaveRanges(Header.ranges("prepend"))))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "MACROA").ID, + HaveRanges(Header.ranges("macroa"))))); +} + +TEST_F(SymbolCollectorTest, MacroRefWithoutCollectingSymbol) { + Annotations Header(R"( + #define $foo[[FOO]](X) (X + 1) + int abc = $foo[[FOO]](1); + )"); + CollectorOpts.RefFilter = RefKind::All; + CollectorOpts.RefsInHeaders = true; + CollectorOpts.CollectMacro = false; + runSymbolCollector(Header.code(), ""); + EXPECT_THAT(Refs, Contains(Pair(_, HaveRanges(Header.ranges("foo"))))); +} + +TEST_F(SymbolCollectorTest, MacrosWithRefFilter) { + Annotations Header("#define $macro[[MACRO]](X) (X + 1)"); + Annotations Main("void foo() { int x = $macro[[MACRO]](1); }"); + CollectorOpts.RefFilter = RefKind::Unknown; + runSymbolCollector(Header.code(), Main.code()); + EXPECT_THAT(Refs, IsEmpty()); } TEST_F(SymbolCollectorTest, NameReferences) { @@ -675,21 +749,26 @@ TEST_F(SymbolCollectorTest, HeaderAsMainFile) { TestFileName = testPath("foo.hh"); runSymbolCollector("", Header.code()); EXPECT_THAT(Symbols, UnorderedElementsAre(QName("Foo"), QName("Func"))); - EXPECT_THAT(Refs, UnorderedElementsAre(Pair(findSymbol(Symbols, "Foo").ID, - HaveRanges(Header.ranges("Foo"))), - Pair(findSymbol(Symbols, "Func").ID, - HaveRanges(Header.ranges("Func"))))); + EXPECT_THAT(Refs, + UnorderedElementsAre(Pair(findSymbol(Symbols, "Foo").ID, + HaveRanges(Header.ranges("Foo"))), + Pair(findSymbol(Symbols, "Func").ID, + HaveRanges(Header.ranges("Func"))))); } TEST_F(SymbolCollectorTest, RefsInHeaders) { CollectorOpts.RefFilter = RefKind::All; CollectorOpts.RefsInHeaders = true; + CollectorOpts.CollectMacro = true; Annotations Header(R"( - class [[Foo]] {}; + #define $macro[[MACRO]](x) (x+1) + class $foo[[Foo]] {}; )"); runSymbolCollector(Header.code(), ""); EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "Foo").ID, - HaveRanges(Header.ranges())))); + HaveRanges(Header.ranges("foo"))))); + EXPECT_THAT(Refs, Contains(Pair(findSymbol(Symbols, "MACRO").ID, + HaveRanges(Header.ranges("macro"))))); } TEST_F(SymbolCollectorTest, Relations) { @@ -704,7 +783,7 @@ TEST_F(SymbolCollectorTest, Relations) { Contains(Relation{Base.ID, RelationKind::BaseOf, Derived.ID})); } -TEST_F(SymbolCollectorTest, References) { +TEST_F(SymbolCollectorTest, CountReferences) { const std::string Header = R"( class W; class X {}; diff --git a/clang-tools-extra/clangd/unittests/SyncAPI.cpp 
b/clang-tools-extra/clangd/unittests/SyncAPI.cpp
index 812fa7a0f2ecb..5c7949ab41baf 100644
--- a/clang-tools-extra/clangd/unittests/SyncAPI.cpp
+++ b/clang-tools-extra/clangd/unittests/SyncAPI.cpp
@@ -96,11 +96,10 @@ runFindDocumentHighlights(ClangdServer &Server, PathRef File, Position Pos) {
   return std::move(*Result);
 }
 
-llvm::Expected<std::vector<tooling::Replacement>> runRename(ClangdServer &Server,
-                                                            PathRef File, Position Pos,
-                                                            llvm::StringRef NewName) {
-  llvm::Optional<llvm::Expected<std::vector<tooling::Replacement>>> Result;
-  Server.rename(File, Pos, NewName, /*WantFormat=*/true, capture(Result));
+llvm::Expected<FileEdits> runRename(ClangdServer &Server, PathRef File,
+                                    Position Pos, llvm::StringRef NewName) {
+  llvm::Optional<llvm::Expected<FileEdits>> Result;
+  Server.rename(File, Pos, NewName, /*WantFormat=*/false, capture(Result));
   return std::move(*Result);
 }
 
diff --git a/clang-tools-extra/clangd/unittests/SyncAPI.h b/clang-tools-extra/clangd/unittests/SyncAPI.h
index 5ffed1fbb120c..55a538ef6a977 100644
--- a/clang-tools-extra/clangd/unittests/SyncAPI.h
+++ b/clang-tools-extra/clangd/unittests/SyncAPI.h
@@ -38,8 +38,8 @@ runLocateSymbolAt(ClangdServer &Server, PathRef File, Position Pos);
 llvm::Expected<std::vector<DocumentHighlight>>
 runFindDocumentHighlights(ClangdServer &Server, PathRef File, Position Pos);
 
-llvm::Expected<std::vector<tooling::Replacement>>
-runRename(ClangdServer &Server, PathRef File, Position Pos, StringRef NewName);
+llvm::Expected<FileEdits> runRename(ClangdServer &Server, PathRef File,
+                                    Position Pos, StringRef NewName);
 
 std::string runDumpAST(ClangdServer &Server, PathRef File);
 
diff --git a/clang-tools-extra/clangd/unittests/TweakTesting.cpp b/clang-tools-extra/clangd/unittests/TweakTesting.cpp
index 3331a3d937155..7f9f75c081987 100644
--- a/clang-tools-extra/clangd/unittests/TweakTesting.cpp
+++ b/clang-tools-extra/clangd/unittests/TweakTesting.cpp
@@ -63,12 +63,14 @@ std::pair<unsigned, unsigned> rangeOrPoint(const Annotations &A) {
           cantFail(positionToOffset(A.code(), SelectionRng.end))};
 }
 
-MATCHER_P6(TweakIsAvailable, TweakID, Ctx, Header, ExtraArgs, ExtraFiles, Index,
+MATCHER_P7(TweakIsAvailable, TweakID, Ctx, Header, ExtraArgs, ExtraFiles, Index,
+           FileName,
           (TweakID + (negation ? " is unavailable" : " is available")).str()) {
   std::string WrappedCode = wrap(Ctx, arg);
   Annotations Input(WrappedCode);
   auto Selection = rangeOrPoint(Input);
   TestTU TU;
+  TU.Filename = FileName;
   TU.HeaderCode = Header;
   TU.Code = Input.code();
   TU.ExtraArgs = ExtraArgs;
@@ -91,6 +93,7 @@ std::string TweakTest::apply(llvm::StringRef MarkedCode,
   auto Selection = rangeOrPoint(Input);
   TestTU TU;
+  TU.Filename = FileName;
   TU.HeaderCode = Header;
   TU.AdditionalFiles = std::move(ExtraFiles);
   TU.Code = Input.code();
@@ -124,7 +127,7 @@ std::string TweakTest::apply(llvm::StringRef MarkedCode,
         ADD_FAILURE() << "There were changes to additional files, but client "
                          "provided a nullptr for EditedFiles.";
       else
-        EditedFiles->try_emplace(It.first(), Unwrapped.str());
+        EditedFiles->insert_or_assign(It.first(), Unwrapped.str());
     }
   }
   return EditedMainFile;
@@ -132,7 +135,7 @@ std::string TweakTest::apply(llvm::StringRef MarkedCode,
 
 ::testing::Matcher<llvm::StringRef> TweakTest::isAvailable() const {
   return TweakIsAvailable(llvm::StringRef(TweakID), Context, Header, ExtraArgs,
-                          ExtraFiles, Index.get());
+                          ExtraFiles, Index.get(), FileName);
 }
 
 std::vector<std::string> TweakTest::expandCases(llvm::StringRef MarkedCode) {
diff --git a/clang-tools-extra/clangd/unittests/TweakTesting.h b/clang-tools-extra/clangd/unittests/TweakTesting.h
index ffcf5a0c7ea2a..10186f859bae2 100644
--- a/clang-tools-extra/clangd/unittests/TweakTesting.h
+++ b/clang-tools-extra/clangd/unittests/TweakTesting.h
@@ -12,6 +12,7 @@
 #include "TestTU.h"
 #include "index/Index.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 #include <string>
@@ -62,6 +63,8 @@ class TweakTest : public ::testing::Test {
   // testcases.
   std::string Header;
 
+  llvm::StringRef FileName = "TestTU.cpp";
+
   // Extra flags passed to the compilation in apply().
   std::vector<std::string> ExtraArgs;
 
diff --git a/clang-tools-extra/clangd/unittests/TweakTests.cpp b/clang-tools-extra/clangd/unittests/TweakTests.cpp
index 4e481241acd8c..f45866a52bd53 100644
--- a/clang-tools-extra/clangd/unittests/TweakTests.cpp
+++ b/clang-tools-extra/clangd/unittests/TweakTests.cpp
@@ -122,6 +122,25 @@ literal)")cpp";
   EXPECT_EQ(apply(Input), Output);
 }
 
+TWEAK_TEST(ObjCLocalizeStringLiteral);
+TEST_F(ObjCLocalizeStringLiteralTest, Test) {
+  ExtraArgs.push_back("-x");
+  ExtraArgs.push_back("objective-c");
+
+  // Ensure that the action can be initiated in the string literal.
+  EXPECT_AVAILABLE(R"(id x = ^[[@[[^"^t^est^"]]]];)");
+
+  // Ensure that the action can't be initiated in other places.
+  EXPECT_UNAVAILABLE(R"([[i^d ^[[x]] ^= @"test";^]])");
+
+  // Ensure that the action is not available for regular C strings.
+  EXPECT_UNAVAILABLE(R"(const char * x= "^test";)");
+
+  const char *Input = R"(id x = [[@"test"]];)";
+  const char *Output = R"(id x = NSLocalizedString(@"test", @"");)";
+  EXPECT_EQ(apply(Input), Output);
+}
+
 TWEAK_TEST(DumpAST);
 TEST_F(DumpASTTest, Test) {
   EXPECT_AVAILABLE("^int f^oo() { re^turn 2 ^+ 2; }");
@@ -269,7 +288,7 @@ TEST_F(ExtractVariableTest, Test) {
   EXPECT_UNAVAILABLE(UnavailableCases);
 
   // vector of pairs of input and output strings
-  const std::vector>
+  const std::vector>
       InputOutputs = {
           // extraction from variable declaration/assignment
          {R"cpp(void varDecl() {
@@ -321,17 +340,10 @@ TEST_F(ExtractVariableTest, Test) {
                    if(1)
                     LOOP(5 + [[3]])
                  })cpp",
-          /*FIXME: It should be extracted like this. SelectionTree needs to be
-           * fixed for macros.
           R"cpp(#define LOOP(x) while (1) {a = x;}
-          void f(int a) {
-            auto dummy = 3; if(1)
-             LOOP(5 + dummy)
-          })cpp"},*/
-          R"cpp(#define LOOP(x) while (1) {a = x;}
           void f(int a) {
-            auto dummy = LOOP(5 + 3); if(1)
-             dummy
+            auto dummy = 3; if(1)
+             LOOP(5 + dummy)
           })cpp"},
          {R"cpp(#define LOOP(x) do {x;} while(1);
           void f(int a) {
@@ -644,13 +656,18 @@ void f(const int c) {
   )cpp";
   EXPECT_EQ(apply(TemplateFailInput), "unavailable");
 
-  // FIXME: This should be extractable after selectionTree works correctly for
-  // macros (currently it doesn't select anything for the following case)
-  std::string MacroFailInput = R"cpp(
+  std::string MacroInput = R"cpp(
     #define F(BODY) void f() { BODY }
     F ([[int x = 0;]])
   )cpp";
-  EXPECT_EQ(apply(MacroFailInput), "unavailable");
+  std::string MacroOutput = R"cpp(
+    #define F(BODY) void f() { BODY }
+    void extracted() {
+int x = 0;
+}
+F (extracted();)
+  )cpp";
+  EXPECT_EQ(apply(MacroInput), MacroOutput);
 
   // Shouldn't crash.
   EXPECT_EQ(apply("void f([[int a]]);"), "unavailable");
@@ -1809,6 +1826,276 @@ TEST_F(DefineInlineTest, QualifyWithUsingDirectives) {
   EXPECT_EQ(apply(Test), Expected) << Test;
 }
 
+TWEAK_TEST(DefineOutline);
+TEST_F(DefineOutlineTest, TriggersOnFunctionDecl) {
+  FileName = "Test.cpp";
+  // Not available unless in a header file.
+  EXPECT_UNAVAILABLE(R"cpp(
+    [[void [[f^o^o]]() [[{
+      return;
+    }]]]])cpp");
+
+  FileName = "Test.hpp";
+  // Not available unless function name or full body is selected.
+  EXPECT_UNAVAILABLE(R"cpp(
+    // Not a definition
+    vo^i[[d^ ^f]]^oo();
+
+    [[vo^id ]]foo[[()]] {[[
+      [[(void)(5+3);
+      return;]]
+    }]])cpp");
+
+  // Available even if there are no implementation files.
+  EXPECT_AVAILABLE(R"cpp(
+    [[void [[f^o^o]]() [[{
+      return;
+    }]]]])cpp");
+
+  // Not available for out-of-line methods.
+  EXPECT_UNAVAILABLE(R"cpp(
+    class Bar {
+      void baz();
+    };
+
+    [[void [[Bar::[[b^a^z]]]]() [[{
+      return;
+    }]]]])cpp");
+
+  // Basic check for function body and signature.
+  EXPECT_AVAILABLE(R"cpp(
+    class Bar {
+      [[void [[f^o^o]]() [[{ return; }]]]]
+    };
+
+    void foo();
+    [[void [[f^o^o]]() [[{
+      return;
+    }]]]])cpp");
+
+  // Not available on defaulted/deleted members.
+  EXPECT_UNAVAILABLE(R"cpp(
+    class Foo {
+      Fo^o() = default;
+      F^oo(const Foo&) = delete;
+    };)cpp");
+
+  // Not available within templated classes, as it is hard to spell the class
+  // name out-of-line in such cases.
+  EXPECT_UNAVAILABLE(R"cpp(
+    template <typename T> struct Foo { void fo^o(){} };
+    })cpp");
+}
+
+TEST_F(DefineOutlineTest, FailsWithoutSource) {
+  FileName = "Test.hpp";
+  llvm::StringRef Test = "void fo^o() { return; }";
+  llvm::StringRef Expected =
+      "fail: Couldn't find a suitable implementation file.";
+  EXPECT_EQ(apply(Test), Expected);
+}
+
+TEST_F(DefineOutlineTest, ApplyTest) {
+  llvm::StringMap<std::string> EditedFiles;
+  ExtraFiles["Test.cpp"] = "";
+  FileName = "Test.hpp";
+  // Template body is not parsed until instantiation time on Windows, which
+  // results in arbitrary failures as function body becomes NULL.
+  ExtraArgs.push_back("-fno-delayed-template-parsing");
+
+  struct {
+    llvm::StringRef Test;
+    llvm::StringRef ExpectedHeader;
+    llvm::StringRef ExpectedSource;
+  } Cases[] = {
+      // Simple check
+      {
+          "void fo^o() { return; }",
+          "void foo() ;",
+          "void foo() { return; }",
+      },
+      // Templated function.
+      {
+          "template <typename T> void fo^o(T, T x) { return; }",
+          "template <typename T> void foo(T, T x) ;",
+          "template <typename T> void foo(T, T x) { return; }",
+      },
+      {
+          "template <typename T> void fo^o() { return; }",
+          "template <typename T> void foo() ;",
+          "template <typename T> void foo() { return; }",
+      },
+      // Template specialization.
+      {
+          R"cpp(
+            template <typename T> void foo();
+            template <> void fo^o<int>() { return; })cpp",
+          R"cpp(
+            template <typename T> void foo();
+            template <> void foo<int>() ;)cpp",
+          "template <> void foo<int>() { return; }",
+      },
+  };
+  for (const auto &Case : Cases) {
+    SCOPED_TRACE(Case.Test);
+    EXPECT_EQ(apply(Case.Test, &EditedFiles), Case.ExpectedHeader);
+    EXPECT_THAT(EditedFiles, testing::ElementsAre(FileWithContents(
+                                 testPath("Test.cpp"), Case.ExpectedSource)));
+  }
+}
+
+TEST_F(DefineOutlineTest, HandleMacros) {
+  llvm::StringMap<std::string> EditedFiles;
+  ExtraFiles["Test.cpp"] = "";
+  FileName = "Test.hpp";
+
+  struct {
+    llvm::StringRef Test;
+    llvm::StringRef ExpectedHeader;
+    llvm::StringRef ExpectedSource;
+  } Cases[] = {
+      {R"cpp(
+          #define BODY { return; }
+          void f^oo()BODY)cpp",
+       R"cpp(
+          #define BODY { return; }
+          void foo();)cpp",
+       "void foo()BODY"},
+
+      {R"cpp(
+          #define BODY return;
+          void f^oo(){BODY})cpp",
+       R"cpp(
+          #define BODY return;
+          void foo();)cpp",
+       "void foo(){BODY}"},
+
+      {R"cpp(
+          #define TARGET void foo()
+          [[TARGET]]{ return; })cpp",
+       R"cpp(
+          #define TARGET void foo()
+          TARGET;)cpp",
+       "TARGET{ return; }"},
+
+      {R"cpp(
+          #define TARGET foo
+          void [[TARGET]](){ return; })cpp",
+       R"cpp(
+          #define TARGET foo
+          void TARGET();)cpp",
+       "void TARGET(){ return; }"},
+  };
+  for (const auto &Case : Cases) {
+    SCOPED_TRACE(Case.Test);
+    EXPECT_EQ(apply(Case.Test, &EditedFiles), Case.ExpectedHeader);
+    EXPECT_THAT(EditedFiles, testing::ElementsAre(FileWithContents(
+                                 testPath("Test.cpp"), Case.ExpectedSource)));
+  }
+}
+
+TEST_F(DefineOutlineTest, QualifyReturnValue) {
+  FileName = "Test.hpp";
+  ExtraFiles["Test.cpp"] = "";
+
+  struct {
+    llvm::StringRef Test;
+    llvm::StringRef ExpectedHeader;
+    llvm::StringRef ExpectedSource;
+  } Cases[] = {
+      {R"cpp(
+          namespace a { class Foo; }
+          using namespace a;
+          Foo fo^o() { return; })cpp",
+       R"cpp(
+          namespace a { class Foo; }
+          using namespace a;
+          Foo foo() ;)cpp",
+       "a::Foo foo() { return; }"},
+      {R"cpp(
+          namespace a {
+            class Foo {
+              class Bar {};
+              Bar fo^o() { return {}; }
+            };
+          })cpp",
+       R"cpp(
+          namespace a {
+            class Foo {
+              class Bar {};
+              Bar foo() ;
+            };
+          })cpp",
+       "a::Foo::Bar a::Foo::foo() { return {}; }\n"},
+      {R"cpp(
+          class Foo;
+          Foo fo^o() { return; })cpp",
+       R"cpp(
+          class Foo;
+          Foo foo() ;)cpp",
+       "Foo foo() { return; }"},
+  };
+  llvm::StringMap<std::string> EditedFiles;
+  for (auto &Case : Cases) {
+    apply(Case.Test, &EditedFiles);
+    EXPECT_EQ(apply(Case.Test, &EditedFiles), Case.ExpectedHeader);
+    EXPECT_THAT(EditedFiles, testing::ElementsAre(FileWithContents(
+                                 testPath("Test.cpp"), Case.ExpectedSource)));
+  }
+}
+
+TEST_F(DefineOutlineTest, QualifyFunctionName) {
+  FileName = "Test.hpp";
+  struct {
+    llvm::StringRef TestHeader;
+    llvm::StringRef TestSource;
+    llvm::StringRef ExpectedHeader;
+    llvm::StringRef ExpectedSource;
+  } Cases[] = {
+      {
+          R"cpp(
+            namespace a {
+              namespace b {
+                class Foo {
+                  void fo^o() {}
+                };
+              }
+            })cpp",
+          "",
+          R"cpp(
+            namespace a {
+              namespace b {
+                class Foo {
+                  void foo() ;
+                };
+              }
+            })cpp",
+          "void a::b::Foo::foo() {}\n",
+      },
+      {
+          "namespace a { namespace b { void f^oo() {} } }",
+          "namespace a{}",
+          "namespace a { namespace b { void foo() ; } }",
+          "namespace a{void b::foo() {} }",
+      },
+      {
+          "namespace a { namespace b { void f^oo() {} } }",
+          "using namespace a;",
+          "namespace a { namespace b { void foo() ; } }",
+          // FIXME: Take using namespace directives in the source file into
+          // account. This can be spelled as b::foo instead.
+          "using namespace a;void a::b::foo() {} ",
+      },
+  };
+  llvm::StringMap<std::string> EditedFiles;
+  for (auto &Case : Cases) {
+    ExtraFiles["Test.cpp"] = Case.TestSource;
+    EXPECT_EQ(apply(Case.TestHeader, &EditedFiles), Case.ExpectedHeader);
+    EXPECT_THAT(EditedFiles, testing::ElementsAre(FileWithContents(
+                                 testPath("Test.cpp"), Case.ExpectedSource)))
+        << Case.TestHeader;
+  }
+}
 } // namespace
 } // namespace clangd
 } // namespace clang
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index b96feecdf3d61..91a196deb6f41 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -196,6 +196,14 @@ Improvements to clang-tidy
   ` check now supports a `StringNames` option enabling its application to
   custom string classes.
 
+- Improved :doc:`modernize-avoid-bind
+  <clang-tidy/checks/modernize-avoid-bind>` check.
+
+  The check now supports diagnosing and fixing arbitrary callables instead of
+  only simple free functions. The `PermissiveParameterList` option has also
+  been added to address situations where the existing fix-it logic would
+  sometimes generate code that no longer compiles.
+
 Improvements to include-fixer
 -----------------------------
 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/modernize-avoid-bind.rst b/clang-tools-extra/docs/clang-tidy/checks/modernize-avoid-bind.rst
index 7ea9beca8e882..82c290e4a21b7 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/modernize-avoid-bind.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/modernize-avoid-bind.rst
@@ -3,10 +3,15 @@
 modernize-avoid-bind
 ====================
 
-The check finds uses of ``std::bind`` and replaces simple uses with lambdas.
-Lambdas will use value-capture where required.
+The check finds uses of ``std::bind`` and ``boost::bind`` and replaces them
+with lambdas. Lambdas will use value-capture unless reference capture is
+explicitly requested with ``std::ref`` or ``boost::ref``.
 
-Right now it only handles free functions, not member functions.
+It supports arbitrary callables, including member functions, function objects,
+and free functions, and all variations thereof. Anything that you can pass
+to the first argument of ``bind`` should be diagnosable. Currently, the only
+known case where a fix-it is unsupported is when the same placeholder is
+specified multiple times in the parameter list.
 
 Given:
 
@@ -35,3 +40,49 @@
 ``std::bind`` can be hard to read and can result in larger object files and
 binaries due to type information that will not be produced by equivalent
 lambdas.
+
+Options
+-------
+
+.. option:: PermissiveParameterList
+
+  If the option is set to non-zero, the check will append ``auto&&...`` to the
+  end of every placeholder parameter list. Without this, it is possible for a
+  fix-it to perform an incorrect transformation in the case where the result
+  of the ``bind`` is used in the context of a type-erased functor such as
+  ``std::function``, which allows mismatched arguments. For example:
+
+
+ int add(int x, int y) { return x + y; }
+ int foo() {
+ std::function ignore_args = [] { return add(2, 2); }
+ return ignore_args(3, 3);
+ }
+
+which will *not* compile, since the lambda does not contain an ``operator()``
+that accepts 2 arguments. With permissive parameter list, it instead generates
+
+.. code-block:: c++
+
+ int add(int x, int y) { return x + y; }
+ int foo() {
+ std::function ignore_args = [](auto&&...) { return add(2, 2); }
+ return ignore_args(3, 3);
+ }
+
+which is correct.
+
+This check requires C++14 or higher to run.
diff --git a/clang-tools-extra/test/clang-change-namespace/macro.cpp b/clang-tools-extra/test/clang-change-namespace/macro.cpp
index ba47de603da81..40c4caf058993 100644
--- a/clang-tools-extra/test/clang-change-namespace/macro.cpp
+++ b/clang-tools-extra/test/clang-change-namespace/macro.cpp
@@ -1,7 +1,7 @@
 // RUN: cp %S/macro.cpp %T/macro.cpp
 // RUN: echo "#define USING using na::nc::X" > %T/macro.h
 //
-// RUN: clang-change-namespace -old_namespace "na::nb" -new_namespace "x::y" --file_pattern "macro.cpp" --i %T/macro.cpp --
+// RUN: clang-change-namespace -old_namespace "na::nb" -new_namespace "x::y" --file_pattern "macro.cpp$" --i %T/macro.cpp --
 // RUN: FileCheck -input-file=%T/macro.cpp -check-prefix=CHECK-CC %s
 // RUN: FileCheck -input-file=%T/macro.h -check-prefix=CHECK-HEADER %s
 //
diff --git a/clang-tools-extra/test/clang-tidy/bugprone-suspicious-semicolon-constexpr.cpp b/clang-tools-extra/test/clang-tidy/bugprone-suspicious-semicolon-constexpr.cpp
new file mode 100644
index 0000000000000..c18dd7bd1e932
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/bugprone-suspicious-semicolon-constexpr.cpp
@@ -0,0 +1,31 @@
+// RUN: %check_clang_tidy %s bugprone-suspicious-semicolon %t -- -- -std=c++17
+
+void fail()
+{
+ int x = 0;
+ if(x > 5); (void)x;
+ // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: potentially unintended semicolon [bugprone-suspicious-semicolon]
+ // CHECK-FIXES: if(x > 5) (void)x;
+}
+
+template
+int foo(int a) {
+ if constexpr(X > 0) {
+ return a;
+ }
+ return a + 1;
+}
+
+template
+int foo2(int a) {
+ // FIXME: diagnose the case below. See https://reviews.llvm.org/D46234
+ // for details.
+ if constexpr(X > 0);
+ return a;
+ return a + 1;
+}
+
+int main(void) {
+ foo2<0>(1);
+ return foo<0>(1);
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-macro-parentheses.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-macro-parentheses.cpp
index 2cc45e83b2037..8d128352e7894 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-macro-parentheses.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-macro-parentheses.cpp
@@ -43,6 +43,7 @@
 #define GOOD30(args...) std::cout << args;
 #define GOOD31(X) A*X=2
 #define GOOD32(X) std::vector
+#define GOOD33(x) if (!a__##x) a_##x = &f(#x)
 // These are allowed for now..
#define MAYBE1 *12.34 diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp index 18fe5ef4e5c2c..2c288e0bbddf9 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-string-integer-assignment.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s bugprone-string-integer-assignment %t +// RUN: %check_clang_tidy %s bugprone-string-integer-assignment %t -- -- -fno-delayed-template-parsing namespace std { template @@ -103,6 +103,8 @@ struct S { static constexpr T t = 0x8000; std::string s; void f(char c) { s += c | static_cast(t); } + // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: an integer is interpreted as a chara + // CHECK-FIXES: {{^}} void f(char c) { s += std::to_string(c | static_cast(t)); } }; template S; diff --git a/clang-tools-extra/test/clang-tidy/checkers/google-readability-namespace-comments.cpp b/clang-tools-extra/test/clang-tidy/checkers/google-readability-namespace-comments.cpp index b4e79c97c0056..591c9dae5a74e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google-readability-namespace-comments.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google-readability-namespace-comments.cpp @@ -25,10 +25,10 @@ void f(); // So that the namespace isn't empty. // 5 // 6 // 7 -// CHECK-MESSAGES: :[[@LINE+2]]:1: warning: namespace 'MACRO' not terminated with -// CHECK-MESSAGES: :[[@LINE-10]]:11: note: namespace 'MACRO' starts here +// CHECK-MESSAGES: :[[@LINE+2]]:1: warning: namespace 'macro_expansion' not terminated with +// CHECK-MESSAGES: :[[@LINE-10]]:11: note: namespace 'macro_expansion' starts here } -// CHECK-FIXES: } // namespace MACRO +// CHECK-FIXES: } // namespace macro_expansion namespace short1 { namespace short2 { diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvm-namespace-comment.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvm-namespace-comment.cpp deleted file mode 100644 index a7d315693421d..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/llvm-namespace-comment.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// RUN: %check_clang_tidy %s llvm-namespace-comment %t - -namespace n1 { -namespace n2 { - void f(); - - - // CHECK-MESSAGES: :[[@LINE+2]]:1: warning: namespace 'n2' not terminated with a closing comment [llvm-namespace-comment] - // CHECK-MESSAGES: :[[@LINE+1]]:2: warning: namespace 'n1' not terminated with a closing comment [llvm-namespace-comment] -}} -// CHECK-FIXES: } // namespace n2 -// CHECK-FIXES: } // namespace n1 - -#define MACRO macro_expansion -namespace MACRO { - void f(); - // CHECK-MESSAGES: :[[@LINE+1]]:1: warning: namespace 'MACRO' not terminated with a closing comment [llvm-namespace-comment] -} -// CHECK-FIXES: } // namespace MACRO - -namespace MACRO { - void g(); -} // namespace MACRO - -namespace MACRO { - void h(); - // CHECK-MESSAGES: :[[@LINE+1]]:2: warning: namespace 'MACRO' ends with a comment that refers to an expansion of macro [llvm-namespace-comment] -} // namespace macro_expansion -// CHECK-FIXES: } // namespace MACRO - -namespace n1 { -namespace MACRO { -namespace n2 { - void f(); - // CHECK-MESSAGES: :[[@LINE+3]]:1: warning: namespace 'n2' not terminated with a closing comment [llvm-namespace-comment] - // CHECK-MESSAGES: :[[@LINE+2]]:2: warning: namespace 'MACRO' not terminated with a closing comment [llvm-namespace-comment] - // CHECK-MESSAGES: :[[@LINE+1]]:3: warning: namespace 'n1' not 
terminated with a closing comment [llvm-namespace-comment] -}}} -// CHECK-FIXES: } // namespace n2 -// CHECK-FIXES: } // namespace MACRO -// CHECK-FIXES: } // namespace n1 diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc-unused-parameters.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc-unused-parameters.cpp index 119eff67318ea..8e546b44ab74d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc-unused-parameters.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc-unused-parameters.cpp @@ -233,7 +233,7 @@ struct a { template class d { a e; - void f() { e.b(); } + void f() { e.b(0); } }; } // namespace } // namespace PR38055 diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind-permissive-parameter-list.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind-permissive-parameter-list.cpp new file mode 100644 index 0000000000000..6c81a6e9ab97d --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind-permissive-parameter-list.cpp @@ -0,0 +1,58 @@ +// RUN: %check_clang_tidy -std=c++14-or-later %s modernize-avoid-bind %t -- \ +// RUN: -config="{CheckOptions: [ \ +// RUN: {key: modernize-avoid-bind.PermissiveParameterList, value: 1}]}" -- + +namespace std { +inline namespace impl { +template +class bind_rt {}; + +template +bind_rt bind(Fp &&, Arguments &&...); +} // namespace impl + +template +T ref(T &t); +} // namespace std + +int add(int x, int y) { return x + y; } + +// Let's fake a minimal std::function-like facility. +namespace std { +template +_Tp declval(); + +template +struct __res { + template + static decltype(declval<_Functor>()(_Args()...)) _S_test(int); + + template + static void _S_test(...); + + using type = decltype(_S_test<_ArgTypes...>(0)); +}; + +template +struct function; + +template +struct function { + template ::type> + function(_Functor) {} +}; +} // namespace std + +struct placeholder {}; +placeholder _1; + +void testLiteralParameters() { + auto AAA = std::bind(add, 2, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto AAA = [](auto && ...) { return add(2, 2); }; + + auto BBB = std::bind(add, _1, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto BBB = [](auto && PH1, auto && ...) 
{ return add(PH1, 2); }; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind.cpp index fa60cdc2c9d08..7e00858c1acce 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize-avoid-bind.cpp @@ -8,75 +8,62 @@ class bind_rt {}; template bind_rt bind(Fp &&, Arguments &&...); } + +template +T ref(T &t); } -int add(int x, int y) { return x + y; } +namespace boost { +template +class bind_rt {}; -void f() { - auto clj = std::bind(add, 2, 2); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind [modernize-avoid-bind] - // CHECK-FIXES: auto clj = [] { return add(2, 2); }; -} +template +bind_rt bind(const Fp &, Arguments...); -void g() { - int x = 2; - int y = 2; - auto clj = std::bind(add, x, y); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // CHECK-FIXES: auto clj = [=] { return add(x, y); }; +template +struct reference_wrapper { + explicit reference_wrapper(T &t) {} +}; + +template +reference_wrapper const ref(T &t) { + return reference_wrapper(t); } -struct placeholder {}; -placeholder _1; -placeholder _2; +} // namespace boost -void h() { - int x = 2; - auto clj = std::bind(add, x, _1); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // CHECK-FIXES: auto clj = [=](auto && arg1) { return add(x, arg1); }; -} +namespace C { +int add(int x, int y) { return x + y; } +} // namespace C -struct A; -struct B; -bool ABTest(const A &, const B &); +struct Foo { + static int add(int x, int y) { return x + y; } +}; -void i() { - auto BATest = std::bind(ABTest, _2, _1); - // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: prefer a lambda to std::bind - // CHECK-FIXES: auto BATest = [](auto && arg1, auto && arg2) { return ABTest(arg2, arg1); }; -} +struct D { + D() = default; + void operator()(int x, int y) const {} -void j() { - auto clj = std::bind(add, 2, 2, 2); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // No fix is applied for argument mismatches. - // CHECK-FIXES: auto clj = std::bind(add, 2, 2, 2); -} + void MemberFunction(int x) {} -void k() { - auto clj = std::bind(add, _1, _1); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // No fix is applied for reused placeholders. - // CHECK-FIXES: auto clj = std::bind(add, _1, _1); -} + static D *create(); +}; -void m() { - auto clj = std::bind(add, 1, add(2, 5)); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // No fix is applied for nested calls. - // CHECK-FIXES: auto clj = std::bind(add, 1, add(2, 5)); -} +struct F { + F(int x) {} + ~F() {} -namespace C { - int add(int x, int y){ return x + y; } -} + int get() { return 42; } +}; -void n() { - auto clj = std::bind(C::add, 1, 1); - // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // CHECK-FIXES: auto clj = [] { return C::add(1, 1); }; -} +void UseF(F); + +struct placeholder {}; +placeholder _1; +placeholder _2; + +int add(int x, int y) { return x + y; } +int addThree(int x, int y, int z) { return x + y + z; } // Let's fake a minimal std::function-like facility. 
namespace std { @@ -114,10 +101,213 @@ struct Callback { void Reset(std::function); }; -void test(Thing *t) { +int GlobalVariable = 42; + +struct TestCaptureByValueStruct { + int MemberVariable; + static int StaticMemberVariable; + F MemberStruct; + + void testCaptureByValue(int Param, F f) { + int x = 3; + int y = 4; + auto AAA = std::bind(add, x, y); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto AAA = [x, y] { return add(x, y); }; + + // When the captured variable is repeated, it should only appear in the capture list once. + auto BBB = std::bind(add, x, x); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto BBB = [x] { return add(x, x); }; + + int LocalVariable; + // Global variables shouldn't be captured at all, and members should be captured through this. + auto CCC = std::bind(add, MemberVariable, GlobalVariable); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto CCC = [this] { return add(MemberVariable, GlobalVariable); }; + + // Static member variables shouldn't be captured, but locals should + auto DDD = std::bind(add, TestCaptureByValueStruct::StaticMemberVariable, LocalVariable); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto DDD = [LocalVariable] { return add(TestCaptureByValueStruct::StaticMemberVariable, LocalVariable); }; + + auto EEE = std::bind(add, Param, Param); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto EEE = [Param] { return add(Param, Param); }; + + // The signature of boost::bind() is different, and causes + // CXXBindTemporaryExprs to be created in certain cases. So let's test + // those here. + auto FFF = boost::bind(UseF, f); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to boost::bind [modernize-avoid-bind] + // CHECK-FIXES: auto FFF = [f] { return UseF(f); }; + + auto GGG = boost::bind(UseF, MemberStruct); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to boost::bind [modernize-avoid-bind] + // CHECK-FIXES: auto GGG = [this] { return UseF(MemberStruct); }; + } +}; + +void testLiteralParameters() { + auto AAA = std::bind(add, 2, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto AAA = [] { return add(2, 2); }; + + auto BBB = std::bind(addThree, 2, 3, 4); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind [modernize-avoid-bind] + // CHECK-FIXES: auto BBB = [] { return addThree(2, 3, 4); }; +} + +void testCaptureByReference() { + int x = 2; + int y = 2; + auto AAA = std::bind(add, std::ref(x), std::ref(y)); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto AAA = [&x, &y] { return add(x, y); }; + + auto BBB = std::bind(add, std::ref(x), y); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto BBB = [&x, y] { return add(x, y); }; + + auto CCC = std::bind(add, y, std::ref(x)); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto CCC = [y, &x] { return add(y, x); }; + + // Make sure it works with boost::ref() too which has slightly different + // semantics. 
+ auto DDD = boost::bind(add, boost::ref(x), boost::ref(y)); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to boost::bind + // CHECK-FIXES: auto DDD = [&x, &y] { return add(x, y); }; + + auto EEE = boost::bind(add, boost::ref(x), y); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to boost::bind + // CHECK-FIXES: auto EEE = [&x, y] { return add(x, y); }; + + auto FFF = boost::bind(add, y, boost::ref(x)); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to boost::bind + // CHECK-FIXES: auto FFF = [y, &x] { return add(y, x); }; +} + +void testCaptureByInitExpression() { + int x = 42; + auto AAA = std::bind(add, x, F(x).get()); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto AAA = [x, capture0 = F(x).get()] { return add(x, capture0); }; +} + +void testFunctionObjects() { + D d; + D *e = nullptr; + auto AAA = std::bind(d, 1, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto AAA = [d] { return d(1, 2); } + + auto BBB = std::bind(*e, 1, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto BBB = [e] { return (*e)(1, 2); } + + auto CCC = std::bind(D{}, 1, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto CCC = [] { return D{}(1, 2); } + + auto DDD = std::bind(D(), 1, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto DDD = [] { return D()(1, 2); } + + auto EEE = std::bind(*D::create(), 1, 2); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto EEE = [Func = *D::create()] { return Func(1, 2); }; +} + +void testPlaceholders() { + int x = 2; + auto AAA = std::bind(add, x, _1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto AAA = [x](auto && PH1) { return add(x, PH1); }; + + auto BBB = std::bind(add, _2, _1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto BBB = [](auto && PH1, auto && PH2) { return add(PH2, PH1); }; + + // No fix is applied for reused placeholders. + auto CCC = std::bind(add, _1, _1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto CCC = std::bind(add, _1, _1); + + // When a placeholder is skipped, we always add skipped ones to the lambda as + // unnamed parameters. + auto DDD = std::bind(add, _2, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto DDD = [](auto &&, auto && PH2) { return add(PH2, 1); }; +} + +void testGlobalFunctions() { + auto AAA = std::bind(C::add, 1, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto AAA = [] { return C::add(1, 1); }; + + auto BBB = std::bind(Foo::add, 1, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto BBB = [] { return Foo::add(1, 1); }; + + // The & should get removed inside of the lambda body. 
+ auto CCC = std::bind(&C::add, 1, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto CCC = [] { return C::add(1, 1); }; + + auto DDD = std::bind(&Foo::add, 1, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto DDD = [] { return Foo::add(1, 1); }; + + auto EEE = std::bind(&add, 1, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto EEE = [] { return add(1, 1); }; +} + +void testCapturedSubexpressions() { + int x = 3; + int y = 3; + + auto AAA = std::bind(add, 1, add(2, 5)); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // Results of nested calls are captured by value. + // CHECK-FIXES: auto AAA = [capture0 = add(2, 5)] { return add(1, capture0); }; + + auto BBB = std::bind(add, x, add(y, 5)); + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind + // Results of nested calls are captured by value. + // CHECK-FIXES: auto BBB = [x, capture0 = add(y, 5)] { return add(x, capture0); }; +} + +struct E { + void MemberFunction(int x) {} + + void testMemberFunctions() { + D *d; + D dd; + auto AAA = std::bind(&D::MemberFunction, d, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto AAA = [d] { d->MemberFunction(1); }; + + auto BBB = std::bind(&D::MemberFunction, &dd, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto BBB = [ObjectPtr = &dd] { ObjectPtr->MemberFunction(1); }; + + auto CCC = std::bind(&E::MemberFunction, this, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto CCC = [this] { MemberFunction(1); }; + + // Test what happens when the object pointer is itself a placeholder. + auto DDD = std::bind(&D::MemberFunction, _1, 1); + // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer a lambda to std::bind + // CHECK-FIXES: auto DDD = [](auto && PH1) { PH1->MemberFunction(1); }; + } +}; + +void testStdFunction(Thing *t) { Callback cb; if (t) cb.Reset(std::bind(UseThing, t)); // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: prefer a lambda to std::bind - // CHECK-FIXES: cb.Reset([=] { return UseThing(t); }); + // CHECK-FIXES: cb.Reset([t] { return UseThing(t); }); } diff --git a/clang/bindings/python/tests/CMakeLists.txt b/clang/bindings/python/tests/CMakeLists.txt index 3f5ac957f81d4..626256af9c1b6 100644 --- a/clang/bindings/python/tests/CMakeLists.txt +++ b/clang/bindings/python/tests/CMakeLists.txt @@ -32,6 +32,11 @@ if(WIN32) set(RUN_PYTHON_TESTS FALSE) endif() +# The Python FFI interface is broken on AIX: https://bugs.python.org/issue38628. +if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(RUN_PYTHON_TESTS FALSE) +endif() + # AArch64, Hexagon, and Sparc have known test failures that need to be # addressed. # SystemZ has broken Python/FFI interface: diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index 492eec71f2e4e..e8d561fae9564 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -2430,31 +2430,10 @@ Enable XNACK (AMDGPU only) ARM --- - -.. option:: -ffixed-r6 - -Reserve the r6 register (ARM only) - -.. option:: -ffixed-r7 - -Reserve the r7 register (ARM only) - -.. option:: -ffixed-r8 - -Reserve the r8 register (ARM only) - .. option:: -ffixed-r9 Reserve the r9 register (ARM only) -.. 
option:: -ffixed-r10 - -Reserve the r10 register (ARM only) - -.. option:: -ffixed-r11 - -Reserve the r11 register (ARM only) - .. option:: -mexecute-only, -mno-execute-only, -mpure-code Disallow generation of data access to code sections (ARM only) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index f438ec7f871b7..2f7483435fd4f 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -2328,6 +2328,9 @@ the configuration (without a prefix: ``Auto``). true: false: x = ( int32 )y vs. x = (int32)y +**SpacesInConditionalStatement** (``bool``) + If ``true``, spaces will be inserted around if/for/while (and similar) conditions. + **SpacesInContainerLiterals** (``bool``) If ``true``, spaces are inserted inside container literals (e.g. ObjC and Javascript array and dict literals). diff --git a/clang/docs/ConstantInterpreter.rst b/clang/docs/ConstantInterpreter.rst index d4fb8f6f34aa8..a86161c8fa011 100644 --- a/clang/docs/ConstantInterpreter.rst +++ b/clang/docs/ConstantInterpreter.rst @@ -10,8 +10,7 @@ Introduction The constexpr interpreter aims to replace the existing tree evaluator in clang, improving performance on constructs which are executed inefficiently by the evaluator. The interpreter is activated using the following flags: -* ``-fexperimental-new-constant-interpreter`` enables the interpreter, falling back to the evaluator for unsupported features -* ``-fforce-experimental-new-constant-interpreter`` forces the use of the interpreter, bailing out if an unsupported feature is encountered +* ``-fexperimental-new-constant-interpreter`` enables the interpreter, emitting an error if an unsupported feature is encountered Bytecode Compilation ==================== diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index ee80f1afb9e62..6c113fa6b43fe 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -153,7 +153,7 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | task extension | mutexinoutset dependence-type for tasks | :good:`done` | D53380,D57576 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| task extension | combined taskloop constructs | :none:`unclaimed` | | +| task extension | combined taskloop constructs | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | task extension | master taskloop | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ @@ -173,9 +173,9 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | OMP_TARGET_OFFLOAD environment variable | :good:`done` | D50522 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device extension | support full 'defaultmap' functionality | :part:`worked on` | D69204 | +| device extension | support full 'defaultmap' functionality | :good:`done` | D69204 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device extension | device specific functions | :none:`unclaimed` | | +| device extension | device specific functions | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | clause: device_type | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ @@ -191,9 +191,9 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | allow access to the reference count (omp_target_is_present) | :part:`worked on` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device extension | requires directive (unified shared memory) | :part:`worked on` | | +| device extension | requires directive (unified shared memory) | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device extension | clause: unified_address, unified_shared_memory | :part:`worked on` | D52625,D52359 | +| device extension | clause: unified_address, unified_shared_memory | :good:`done` | D52625,D52359 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | clause: reverse_offload | :none:`unclaimed parts` | D52780 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4ac300deb589a..37a8f30e0bc9c 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -78,6 +78,10 @@ Non-comprehensive list of changes in this release been extended to detect these cases, so that code relying on them can be detected and fixed. +* The Implicit Conversion Sanitizer (``-fsanitize=implicit-conversion``) has + learned to sanitize pre/post increment/decrement of types with bit width + smaller than ``int``. 
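For reference, a minimal sketch of a pattern the sanitizer now reports (the file name ``demo.cpp`` and the wording of the comments are illustrative assumptions, not taken from the release note):

.. code-block:: c++

  // Build with: clang++ -fsanitize=implicit-conversion demo.cpp
  int main() {
    unsigned char c = 255;
    // The post-increment promotes 'c' to 'int', adds 1 (yielding 256), then
    // truncates the result back to 'unsigned char'. That lossy implicit
    // conversion (256 becomes 0) is what the sanitizer now reports at run time.
    c++;
    return c;
  }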
+
 - For X86 target, -march=skylake-avx512, -march=icelake-client, -march=icelake-server, -march=cascadelake, -march=cooperlake will default to not using 512-bit zmm registers in vectorized code unless 512-bit intrinsics
diff --git a/clang/docs/SourceBasedCodeCoverage.rst b/clang/docs/SourceBasedCodeCoverage.rst
index 73197a57713f9..7e711819be34a 100644
--- a/clang/docs/SourceBasedCodeCoverage.rst
+++ b/clang/docs/SourceBasedCodeCoverage.rst
@@ -302,3 +302,37 @@ Drawbacks and limitations
 If the call to ``may_throw()`` propagates an exception into ``f``, the code coverage tool may mark the ``return`` statement as executed even though it is not. A call to ``longjmp()`` can have similar effects.
+
+Clang implementation details
+============================
+
+This section may be of interest to those wishing to understand or improve
+the clang code coverage implementation.
+
+Gap regions
+-----------
+
+Gap regions are source regions with counts. A reporting tool cannot set a line
+execution count to the count from a gap region unless that region is the only
+one on a line.
+
+Gap regions are used to eliminate unnatural artifacts in coverage reports, such
+as red "unexecuted" highlights present at the end of an otherwise covered line,
+or blue "executed" highlights present at the start of a line that is otherwise
+not executed.
+
+Switch statements
+-----------------
+
+The region mapping for a switch body consists of a gap region that covers the
+entire body (starting from the '{' in 'switch (...) {', and terminating where the
+last case ends). This gap region has a zero count: this causes "gap" areas in
+between case statements, which contain no executable code, to appear uncovered.
+
+When a switch case is visited, the parent region is extended: if the parent
+region has no start location, its start location becomes the start of the case.
+This is used to support switch statements without a ``CompoundStmt`` body, in
+which the switch body and the single case share a count.
+
+For switches with ``CompoundStmt`` bodies, a new region is created at the start
+of each switch case.
diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 714681d7f4cea..62e2575c6b26e 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -1231,10 +1231,10 @@ are listed below.
 **-f[no-]trapping-math**
- ``-fno-trapping-math`` allows optimizations that assume that
- floating point operations cannot generate traps such as divide-by-zero,
- overflow and underflow. Defaults to ``-ftrapping-math``.
- Currently this option has no effect.
+ Control floating point exception behavior. ``-fno-trapping-math`` allows optimizations that assume that floating point operations cannot generate traps such as divide-by-zero, overflow and underflow.
+
+- The option ``-ftrapping-math`` behaves identically to ``-ffp-exception-behavior=strict``.
+- The option ``-fno-trapping-math`` behaves identically to ``-ffp-exception-behavior=ignore``. This is the default.
 .. option:: -ffp-contract=
@@ -1319,6 +1319,52 @@ are listed below.
 Defaults to ``-fno-finite-math``.
+.. _opt_frounding-math:
+
+**-f[no-]rounding-math**
+
+Force floating-point operations to honor the dynamically-set rounding mode by default.
+
+The result of a floating-point operation often cannot be exactly represented in the result type and therefore must be rounded. IEEE 754 describes different rounding modes that control how to perform this rounding, not all of which are supported by all implementations. C provides interfaces (``fesetround`` and ``fesetenv``) for dynamically controlling the rounding mode, and while it also recommends certain conventions for changing the rounding mode, these conventions are not typically enforced in the ABI. Since the rounding mode changes the numerical result of operations, the compiler must understand something about it in order to optimize floating point operations.
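A minimal sketch of the difference ``-frounding-math`` makes, assuming a hosted environment with ``<cfenv>`` (the file name and the behavior noted in the comments are expectations based on the description above, not guarantees):

.. code-block:: c++

  // round.cpp; build with: clang++ -frounding-math round.cpp
  #include <cfenv>
  #include <cstdio>

  int main() {
    std::fesetround(FE_UPWARD);
    double up = 1.0 / 3.0;
    std::fesetround(FE_DOWNWARD);
    double down = 1.0 / 3.0;
    // Under -frounding-math the divisions honor the dynamically-set modes,
    // so 'up' and 'down' differ in the last bit. Under -fno-rounding-math
    // both may be constant-folded using FE_TONEAREST and compare equal.
    std::printf("%d\n", up != down);
    return 0;
  }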
+
+Note that floating-point operations performed as part of constant initialization are formally performed prior to the start of the program and are therefore not subject to the current rounding mode. This includes the initialization of global variables and local ``static`` variables. Floating-point operations in these contexts will be rounded using ``FE_TONEAREST``.
+
+- The option ``-fno-rounding-math`` allows the compiler to assume that the rounding mode is set to ``FE_TONEAREST``. This is the default.
+- The option ``-frounding-math`` forces the compiler to honor the dynamically-set rounding mode. This prevents optimizations which might affect results if the rounding mode changes or is different from the default; for example, it prevents floating-point operations from being reordered across most calls and prevents constant-folding when the result is not exactly representable.
+
+.. option:: -ffp-model=
+
+ Specify floating point behavior. ``-ffp-model`` is an umbrella
+ option that encompasses functionality provided by other, single
+ purpose, floating point options. Valid values are: ``precise``, ``strict``,
+ and ``fast``.
+ Details:
+
+ * ``precise`` Disables optimizations that are not value-safe on floating-point data, although FP contraction (FMA) is enabled (``-ffp-contract=fast``). This is the default behavior.
+ * ``strict`` Enables ``-frounding-math`` and ``-ffp-exception-behavior=strict``, and disables contractions (FMA). All of the ``-ffast-math`` enablements are disabled.
+ * ``fast`` Behaves identically to specifying both ``-ffast-math`` and ``-ffp-contract=fast``.
+
+ Note: If the command line specifies multiple instances of the ``-ffp-model``
+ option, or specifies ``-ffp-model`` together with a later floating point
+ option that negates part of the selected model, the compiler will issue a
+ diagnostic warning that the override has occurred.
+
+.. option:: -ffp-exception-behavior=
+
+ Specify the floating-point exception behavior.
+
+ Valid values are: ``ignore``, ``maytrap``, and ``strict``.
+ The default value is ``ignore``. Details:
+
+ * ``ignore`` The compiler assumes that the exception status flags will not be read and that floating point exceptions will be masked.
+ * ``maytrap`` The compiler avoids transformations that may raise exceptions that would not have been raised by the original code. Constant folding performed by the compiler is exempt from this option.
+ * ``strict`` The compiler ensures that all transformations strictly preserve the floating point exception semantics of the original code.
+
+
+
 .. _controlling-code-generation:
 Controlling Code Generation
diff --git a/clang/examples/clang-interpreter/main.cpp b/clang/examples/clang-interpreter/main.cpp
index 6ac142bffdffc..db6b0cce4fd17 100644
--- a/clang/examples/clang-interpreter/main.cpp
+++ b/clang/examples/clang-interpreter/main.cpp
@@ -54,6 +54,7 @@ class SimpleJIT {
 std::unique_ptr TM;
 const DataLayout DL;
 MangleAndInterner Mangle{ES, DL};
+ JITDylib &MainJD{ES.createJITDylib("
")}; RTDyldObjectLinkingLayer ObjectLayer{ES, createMemMgr}; IRCompileLayer CompileLayer{ES, ObjectLayer, SimpleCompiler(*TM)}; @@ -66,7 +67,7 @@ class SimpleJIT { std::unique_ptr ProcessSymbolsGenerator) : TM(std::move(TM)), DL(std::move(DL)) { llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr); - ES.getMainJITDylib().addGenerator(std::move(ProcessSymbolsGenerator)); + MainJD.addGenerator(std::move(ProcessSymbolsGenerator)); } public: @@ -95,11 +96,11 @@ class SimpleJIT { const TargetMachine &getTargetMachine() const { return *TM; } Error addModule(ThreadSafeModule M) { - return CompileLayer.add(ES.getMainJITDylib(), std::move(M)); + return CompileLayer.add(MainJD, std::move(M)); } Expected findSymbol(const StringRef &Name) { - return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name)); + return ES.lookup({&MainJD}, Mangle(Name)); } Expected getSymbolAddress(const StringRef &Name) { diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index 31adfc5c368a6..f4913540bab4d 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -1964,6 +1964,14 @@ class FunctionDecl : public DeclaratorDecl, void setRangeEnd(SourceLocation E) { EndRangeLoc = E; } + /// Returns the location of the ellipsis of a variadic function. + SourceLocation getEllipsisLoc() const { + const auto *FPT = getType()->getAs(); + if (FPT && FPT->isVariadic()) + return FPT->getEllipsisLoc(); + return SourceLocation(); + } + SourceRange getSourceRange() const override LLVM_READONLY; // Function definitions. @@ -2188,6 +2196,10 @@ class FunctionDecl : public DeclaratorDecl, bool usesSEHTry() const { return FunctionDeclBits.UsesSEHTry; } void setUsesSEHTry(bool UST) { FunctionDeclBits.UsesSEHTry = UST; } + /// Indicates the function uses Floating Point constrained intrinsics + bool usesFPIntrin() const { return FunctionDeclBits.UsesFPIntrin; } + void setUsesFPIntrin(bool Val) { FunctionDeclBits.UsesFPIntrin = Val; } + /// Whether this function has been deleted. /// /// A function that is "deleted" (via the C++0x "= delete" syntax) @@ -2388,6 +2400,12 @@ class FunctionDecl : public DeclaratorDecl, /// limited representation in the AST. SourceRange getReturnTypeSourceRange() const; + /// Attempt to compute an informative source range covering the + /// function parameters, including the ellipsis of a variadic function. + /// The source range excludes the parentheses, and is invalid if there are + /// no parameters and no ellipsis. + SourceRange getParametersSourceRange() const; + /// Get the declared return type, which may differ from the actual return /// type if the return type is deduced. QualType getDeclaredReturnType() const { diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index adea10b33188b..54cdb84b6f330 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1534,10 +1534,13 @@ class DeclContext { /// Store the ODRHash after first calculation. uint64_t HasODRHash : 1; + + /// Indicates if the function uses Floating Point Constrained Intrinsics + uint64_t UsesFPIntrin : 1; }; /// Number of non-inherited bits in FunctionDeclBitfields. - enum { NumFunctionDeclBits = 25 }; + enum { NumFunctionDeclBits = 26 }; /// Stores the bits used by CXXConstructorDecl. 
If modified /// NumCXXConstructorDeclBits and the accessor @@ -1554,7 +1557,7 @@ class DeclContext { /// exactly 64 bits and thus the width of NumCtorInitializers /// will need to be shrunk if some bit is added to NumDeclContextBitfields, /// NumFunctionDeclBitfields or CXXConstructorDeclBitfields. - uint64_t NumCtorInitializers : 23; + uint64_t NumCtorInitializers : 22; uint64_t IsInheritingConstructor : 1; /// Whether this constructor has a trail-allocated explicit specifier. diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 63d67bd3f55b2..0f2018fb9e8cb 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -3041,7 +3041,9 @@ class NamespaceAliasDecl : public NamedDecl, /// Implicit declaration of a temporary that was materialized by /// a MaterializeTemporaryExpr and lifetime-extended by a declaration -class LifetimeExtendedTemporaryDecl final : public Decl { +class LifetimeExtendedTemporaryDecl final + : public Decl, + public Mergeable { friend class MaterializeTemporaryExpr; friend class ASTDeclReader; diff --git a/clang/include/clang/AST/JSONNodeDumper.h b/clang/include/clang/AST/JSONNodeDumper.h index 37ab8c084e57c..4023e023e9d56 100644 --- a/clang/include/clang/AST/JSONNodeDumper.h +++ b/clang/include/clang/AST/JSONNodeDumper.h @@ -126,7 +126,7 @@ class JSONNodeDumper ASTNameGenerator ASTNameGen; PrintingPolicy PrintPolicy; const comments::CommandTraits *Traits; - StringRef LastLocFilename; + StringRef LastLocFilename, LastLocPresumedFilename; unsigned LastLocLine, LastLocPresumedLine; using InnerAttrVisitor = ConstAttrVisitor; diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 0ff5a614a864d..d293ea190aa43 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -346,6 +346,8 @@ class TextNodeDumper void VisitObjCPropertyImplDecl(const ObjCPropertyImplDecl *D); void VisitBlockDecl(const BlockDecl *D); void VisitConceptDecl(const ConceptDecl *D); + void + VisitLifetimeExtendedTemporaryDecl(const LifetimeExtendedTemporaryDecl *D); }; } // namespace clang diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index c047171730ba7..b15881a682ace 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2077,6 +2077,8 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { bool isAlignValT() const; // C++17 std::align_val_t bool isStdByteType() const; // C++17 std::byte bool isAtomicType() const; // C11 _Atomic() + bool isUndeducedAutoType() const; // C++11 auto or + // C++14 decltype(auto) #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \ bool is##Id##Type() const; @@ -3734,9 +3736,9 @@ class FunctionProtoType final : public FunctionType, public llvm::FoldingSetNode, private llvm::TrailingObjects< - FunctionProtoType, QualType, FunctionType::FunctionTypeExtraBitfields, - FunctionType::ExceptionType, Expr *, FunctionDecl *, - FunctionType::ExtParameterInfo, Qualifiers> { + FunctionProtoType, QualType, SourceLocation, + FunctionType::FunctionTypeExtraBitfields, FunctionType::ExceptionType, + Expr *, FunctionDecl *, FunctionType::ExtParameterInfo, Qualifiers> { friend class ASTContext; // ASTContext creates these. friend TrailingObjects; @@ -3747,6 +3749,9 @@ class FunctionProtoType final // Always present. Note that for the vast majority of FunctionProtoType, // these will be the only trailing objects. 
// + // * Optionally if the function is variadic, the SourceLocation of the + // ellipsis. + // // * Optionally if some extra data is stored in FunctionTypeExtraBitfields // (see FunctionTypeExtraBitfields and FunctionTypeBitfields): // a single FunctionTypeExtraBitfields. Present if and only if @@ -3818,6 +3823,7 @@ class FunctionProtoType final RefQualifierKind RefQualifier = RQ_None; ExceptionSpecInfo ExceptionSpec; const ExtParameterInfo *ExtParameterInfos = nullptr; + SourceLocation EllipsisLoc; ExtProtoInfo() : Variadic(false), HasTrailingReturn(false) {} @@ -3836,6 +3842,10 @@ class FunctionProtoType final return getNumParams(); } + unsigned numTrailingObjects(OverloadToken) const { + return isVariadic(); + } + unsigned numTrailingObjects(OverloadToken) const { return hasExtraBitfields(); } @@ -3947,6 +3957,7 @@ class FunctionProtoType final ExtProtoInfo EPI; EPI.ExtInfo = getExtInfo(); EPI.Variadic = isVariadic(); + EPI.EllipsisLoc = getEllipsisLoc(); EPI.HasTrailingReturn = hasTrailingReturn(); EPI.ExceptionSpec.Type = getExceptionSpecType(); EPI.TypeQuals = getMethodQuals(); @@ -4048,6 +4059,11 @@ class FunctionProtoType final /// Whether this function prototype is variadic. bool isVariadic() const { return FunctionTypeBits.Variadic; } + SourceLocation getEllipsisLoc() const { + return isVariadic() ? *getTrailingObjects() + : SourceLocation(); + } + /// Determines whether this function prototype contains a /// parameter pack at the end. /// @@ -6517,6 +6533,10 @@ inline bool Type::isAtomicType() const { return isa(CanonicalType); } +inline bool Type::isUndeducedAutoType() const { + return isa(CanonicalType); +} + inline bool Type::isObjCQualifiedIdType() const { if (const auto *OPT = getAs()) return OPT->isObjCQualifiedIdType(); diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index f305680d775cf..7f1d429ac3b42 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -173,9 +173,6 @@ class TypeLoc { TypeLoc IgnoreParens() const; - /// Strips MacroDefinitionTypeLocs from a type location. - TypeLoc IgnoreMacroDefinitions() const; - /// Find a type with the location of an explicit type qualifier. 
/// /// The result, if non-null, will be one of: diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 298f688f8c0ad..72564720b7db2 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1107,6 +1107,7 @@ def SYCLDeviceIndirectlyCallable : InheritableAttr { let LangOpts = [SYCLIsDevice]; let Documentation = [SYCLDeviceIndirectlyCallableDocs]; } + def SYCLIntelKernelArgsRestrict : InheritableAttr { let Spellings = [ CXX11<"intel", "kernel_args_restrict"> ]; let Subjects = SubjectList<[Function], ErrorDiag>; @@ -1193,27 +1194,27 @@ def OpenCLAccess : Attr { } def OpenCLPrivateAddressSpace : TypeAttr { - let Spellings = [Keyword<"__private">, Keyword<"private">, Clang<"ocl_private">]; + let Spellings = [Keyword<"__private">, Keyword<"private">, Clang<"opencl_private">]; let Documentation = [OpenCLAddressSpacePrivateDocs]; } def OpenCLGlobalAddressSpace : TypeAttr { - let Spellings = [Keyword<"__global">, Keyword<"global">, Clang<"ocl_global">]; + let Spellings = [Keyword<"__global">, Keyword<"global">, Clang<"opencl_global">]; let Documentation = [OpenCLAddressSpaceGlobalDocs]; } def OpenCLLocalAddressSpace : TypeAttr { - let Spellings = [Keyword<"__local">, Keyword<"local">, Clang<"ocl_local">]; + let Spellings = [Keyword<"__local">, Keyword<"local">, Clang<"opencl_local">]; let Documentation = [OpenCLAddressSpaceLocalDocs]; } def OpenCLConstantAddressSpace : TypeAttr { - let Spellings = [Keyword<"__constant">, Keyword<"constant">, Clang<"ocl_constant">]; + let Spellings = [Keyword<"__constant">, Keyword<"constant">, Clang<"opencl_constant">]; let Documentation = [OpenCLAddressSpaceConstantDocs]; } def OpenCLGenericAddressSpace : TypeAttr { - let Spellings = [Keyword<"__generic">, Keyword<"generic">, Clang<"ocl_generic">]; + let Spellings = [Keyword<"__generic">, Keyword<"generic">, Clang<"opencl_generic">]; let Documentation = [OpenCLAddressSpaceGenericDocs]; } @@ -3676,20 +3677,40 @@ def OMPDeclareVariant : InheritableAttr { } // TODO: add printing of real context selectors. OS << " match("; + int Used[OMP_CTX_SET_unknown] = {0}; for (unsigned I = 0, E = ctxSelectorSets_size(); I < E; ++I) { auto CtxSet = static_cast( *std::next(ctxSelectorSets_begin(), I)); - auto Ctx = static_cast( - *std::next(ctxSelectors_begin(), I)); - assert(CtxSet != OMP_CTX_SET_unknown && Ctx != OMP_CTX_unknown && - "Unknown context selector."); + if (Used[CtxSet]) + continue; + if (I > 0) + OS << ","; switch (CtxSet) { case OMP_CTX_SET_implementation: OS << "implementation={"; + break; + case OMP_CTX_SET_device: + OS << "device={"; + break; + case OMP_CTX_SET_unknown: + llvm_unreachable("Unknown context selector set."); + } + Used[CtxSet] = 1; + for (unsigned K = I, EK = ctxSelectors_size(); K < EK; ++K) { + auto CtxSetK = static_cast( + *std::next(ctxSelectorSets_begin(), K)); + if (CtxSet != CtxSetK) + continue; + if (K != I) + OS << ","; + auto Ctx = static_cast( + *std::next(ctxSelectors_begin(), K)); switch (Ctx) { case OMP_CTX_vendor: + assert(CtxSet == OMP_CTX_SET_implementation && + "Expected implementation context selector set."); OS << "vendor("; - printScore(OS, Policy, I); + printScore(OS, Policy, K); if (implVendors_size() > 0) { OS << *implVendors(). 
begin(); for (StringRef VendorName : llvm::drop_begin(implVendors(), 1)) @@ -3698,16 +3719,8 @@ def OMPDeclareVariant : InheritableAttr { OS << ")"; break; case OMP_CTX_kind: - llvm_unreachable("Unexpected context selector in implementation set."); - case OMP_CTX_unknown: - llvm_unreachable("Unknown context selector."); - } - OS << "}"; - break; - case OMP_CTX_SET_device: - OS << "device={"; - switch (Ctx) { - case OMP_CTX_kind: + assert(CtxSet == OMP_CTX_SET_device && + "Expected device context selector set."); OS << "kind("; if (deviceKinds_size() > 0) { OS << *deviceKinds().begin(); @@ -3716,18 +3729,11 @@ def OMPDeclareVariant : InheritableAttr { } OS << ")"; break; - case OMP_CTX_vendor: - llvm_unreachable("Unexpected context selector in device set."); case OMP_CTX_unknown: llvm_unreachable("Unknown context selector."); } - OS << "}"; - break; - case OMP_CTX_SET_unknown: - llvm_unreachable("Unknown context selector set."); } - if (I != E - 1) - OS << ","; + OS << "}"; } OS << ")"; } diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 933dc24e97bf0..aabb5466314aa 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -149,8 +149,8 @@ def err_drv_missing_arg_mtp : Error< "missing argument to '%0'">; def err_drv_invalid_libcxx_deployment : Error< "invalid deployment target for -stdlib=libc++ (requires %0 or later)">; -def err_drv_invalid_argument_to_fdebug_prefix_map : Error< - "invalid argument '%0' to -fdebug-prefix-map">; +def err_drv_invalid_argument_to_option : Error< + "invalid argument '%0' to -%1">; def err_drv_malformed_sanitizer_blacklist : Error< "malformed sanitizer blacklist: '%0'">; def err_drv_duplicate_config : Error< @@ -454,6 +454,10 @@ def warn_drv_experimental_isel_incomplete_opt : Warning< "-fexperimental-isel support is incomplete for this architecture at the current optimization level">, InGroup; +def warn_drv_experimental_fp_control_incomplete_opt : Warning< + "Support for floating point control option %0 is incomplete and experimental">, + InGroup; + def warn_drv_moutline_unsupported_opt : Warning< "The '%0' architecture does not support -moutline; flag ignored">, InGroup; @@ -477,10 +481,6 @@ def warn_drv_msp430_hwmult_no_device : Warning<"no MCU device specified, but " "specify a MSP430 device, or -mhwmult to set hardware multiply type " "explicitly.">, InGroup; -// Frame pointer reservation. 
-def err_reserved_frame_pointer : Error< - "'%0' has been specified but '%1' is used as the frame pointer for this target">; - def warn_drv_libstdcxx_not_found : Warning< "include path for libstdc++ headers not found; pass '-stdlib=libc++' on the " "command line to use the libc++ standard library instead">, diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 31307b6aaf5fe..d8669c0336831 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -128,6 +128,8 @@ def CXX11CompatDeprecatedWritableStr : def DeprecatedAttributes : DiagGroup<"deprecated-attributes">; def DeprecatedCommaSubscript : DiagGroup<"deprecated-comma-subscript">; +def DeprecatedCopy : DiagGroup<"deprecated-copy">; +def DeprecatedCopyDtor : DiagGroup<"deprecated-copy-dtor">; def DeprecatedDeclarations : DiagGroup<"deprecated-declarations">; def UnavailableDeclarations : DiagGroup<"unavailable-declarations">; def UnguardedAvailabilityNew : DiagGroup<"unguarded-availability-new">; @@ -147,6 +149,8 @@ def DeprecatedWritableStr : DiagGroup<"deprecated-writable-strings", // FIXME: Why is DeprecatedImplementations not in this group? def Deprecated : DiagGroup<"deprecated", [DeprecatedAttributes, DeprecatedCommaSubscript, + DeprecatedCopy, + DeprecatedCopyDtor, DeprecatedDeclarations, DeprecatedDynamicExceptionSpec, DeprecatedIncrementBool, @@ -689,6 +693,7 @@ def ZeroLengthArray : DiagGroup<"zero-length-array">; def GNUZeroLineDirective : DiagGroup<"gnu-zero-line-directive">; def GNUZeroVariadicMacroArguments : DiagGroup<"gnu-zero-variadic-macro-arguments">; def Fallback : DiagGroup<"fallback">; +def MisleadingIndentation : DiagGroup<"misleading-indentation">; // This covers both the deprecated case (in C++98) // and the extension case (in C++11 onwards). @@ -812,6 +817,7 @@ def Move : DiagGroup<"move", [ ]>; def Extra : DiagGroup<"extra", [ + DeprecatedCopy, MissingFieldInitializers, IgnoredQualifiers, InitializerOverrides, @@ -879,7 +885,7 @@ def Consumed : DiagGroup<"consumed">; // Note that putting warnings in -Wall will not disable them by default. If a // warning should be active _only_ when -Wall is passed in, mark it as // DefaultIgnore in addition to putting it here. -def All : DiagGroup<"all", [Most, Parentheses, Switch, SwitchBool]>; +def All : DiagGroup<"all", [Most, Parentheses, Switch, SwitchBool, MisleadingIndentation]>; // Warnings that should be in clang-cl /w4. def : DiagGroup<"CL4", [All, Extra]>; @@ -1107,6 +1113,9 @@ def SpirCompat : DiagGroup<"spir-compat">; // Warning for the experimental-isel options. def ExperimentalISel : DiagGroup<"experimental-isel">; +// Warning for the experimental float control options. +def ExperimentalFloatControl : DiagGroup<"experimental-float-control">; + // A warning group specifically for warnings related to function // multiversioning. def FunctionMultiVersioning : DiagGroup<"function-multiversion">; @@ -1118,9 +1127,6 @@ def CrossTU : DiagGroup<"ctu">; def CTADMaybeUnsupported : DiagGroup<"ctad-maybe-unsupported">; -def FortifySource : DiagGroup<"fortify-source">; - def IntelFPGA : DiagGroup<"intel-fpga">; -// Register reservation. 
-def FixedRegs : DiagGroup<"fixed-registers">; +def FortifySource : DiagGroup<"fortify-source">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index fe0ad6ed14786..3d57942443817 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -61,6 +61,13 @@ def warn_null_statement : Warning< "remove unnecessary ';' to silence this warning">, InGroup, DefaultIgnore; +def warn_misleading_indentation : Warning< + "misleading indentation; statement is not part of " + "the previous '%select{if|else|for|while|else if}0'">, + InGroup, DefaultIgnore; +def note_previous_statement : Note< + "previous statement is here">; + def ext_thread_before : Extension<"'__thread' before '%0'">; def ext_keyword_as_ident : ExtWarn< "keyword '%0' will be made available as an identifier " diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 143cd137336f1..3e9e163b059ff 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -573,9 +573,13 @@ def err_access_decl : Error< "use using declarations instead">; def warn_deprecated_copy_operation : Warning< "definition of implicit copy %select{constructor|assignment operator}1 " - "for %0 is deprecated because it has a user-declared " - "%select{copy %select{assignment operator|constructor}1|destructor}2">, - InGroup, DefaultIgnore; + "for %0 is deprecated because it has a user-declared copy " + "%select{assignment operator|constructor}1">, + InGroup, DefaultIgnore; +def warn_deprecated_copy_dtor_operation : Warning< + "definition of implicit copy %select{constructor|assignment operator}1 " + "for %0 is deprecated because it has a user-declared destructor">, + InGroup, DefaultIgnore; def warn_cxx17_compat_exception_spec_in_signature : Warning< "mangled name of %0 will change in C++17 due to non-throwing exception " "specification in function signature">, InGroup; @@ -7771,8 +7775,6 @@ let CategoryName = "Inline Assembly Issue" in { def err_asm_unknown_register_name : Error<"unknown register name '%0' in asm">; def err_asm_invalid_global_var_reg : Error<"register '%0' unsuitable for " "global register variables on this target">; - def err_asm_missing_fixed_reg_opt : Error<"-ffixed-%0 is required for " - "global named register variable declaration">; def err_asm_register_size_mismatch : Error<"size of register '%0' does not " "match variable size">; def err_asm_bad_register_type : Error<"bad type for named register variable">; @@ -8770,6 +8772,12 @@ def err_32_bit_builtin_64_bit_tgt : Error< "this builtin is only available on 32-bit targets">; def err_builtin_x64_aarch64_only : Error< "this builtin is only available on x86-64 and aarch64 targets">; +def err_mips_builtin_requires_dsp : Error< + "this builtin requires 'dsp' ASE, please use -mdsp">; +def err_mips_builtin_requires_dspr2 : Error< + "this builtin requires 'dsp r2' ASE, please use -mdspr2">; +def err_mips_builtin_requires_msa : Error< + "this builtin requires 'msa' ASE, please use -mmsa">; def err_ppc_builtin_only_on_pwr7 : Error< "this builtin is only valid on POWER7 or later CPUs">; def err_x86_builtin_invalid_rounding : Error< @@ -9320,7 +9328,7 @@ def ext_omp_loop_not_canonical_init : ExtWarn< "('var = init' or 'T var = init')">, InGroup; def err_omp_loop_not_canonical_cond : Error< "condition of OpenMP for loop must be a relational comparison " - "('<', 
'<=', '>', %select{or '>='|'>=', or '!='}0) of loop variable %1">; + "('<', '<=', '>', %select{or '>='|'>=', or '!='}0) of loop variable %1">; def err_omp_loop_not_canonical_incr : Error< "increment clause of OpenMP for loop must perform simple addition " "or subtraction on loop variable %0">; diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 01759f45b227a..dad63a0088485 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -259,6 +259,8 @@ LANGOPT(SinglePrecisionConstants , 1, 0, "treating double-precision floating poi LANGOPT(FastRelaxedMath , 1, 0, "OpenCL fast relaxed math") /// FP_CONTRACT mode (on/off/fast). ENUM_LANGOPT(DefaultFPContractMode, FPContractModeKind, 2, FPC_Off, "FP contraction type") +ENUM_LANGOPT(FPRoundingMode, FPRoundingModeKind, 3, FPR_ToNearest, "FP Rounding Mode type") +ENUM_LANGOPT(FPExceptionMode, FPExceptionModeKind, 2, FPE_Ignore, "FP Exception Behavior Mode type") LANGOPT(NoBitFieldTypeAlign , 1, 0, "bit-field type alignment") LANGOPT(HexagonQdsp6Compat , 1, 0, "hexagon-qdsp6 backward compatibility") LANGOPT(ObjCAutoRefCount , 1, 0, "Objective-C automated reference counting") @@ -301,8 +303,6 @@ BENIGN_LANGOPT(ConstexprStepLimit, 32, 1048576, "maximum constexpr evaluation steps") BENIGN_LANGOPT(EnableNewConstInterp, 1, 0, "enable the experimental new constant interpreter") -BENIGN_LANGOPT(ForceNewConstInterp, 1, 0, - "force the use of the experimental new constant interpreter") BENIGN_LANGOPT(BracketDepth, 32, 256, "maximum bracket nesting depth") BENIGN_LANGOPT(NumLargeByValueCopy, 32, 0, diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 76592df20ddba..e09c3881dc5dc 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -195,6 +195,34 @@ class LangOptions : public LangOptionsBase { FEA_On }; + // Values of the following enumerations correspond to metadata arguments + // specified for constrained floating-point intrinsics: + // http://llvm.org/docs/LangRef.html#constrained-floating-point-intrinsics. + + /// Possible rounding modes. + enum FPRoundingModeKind { + /// Rounding to nearest, corresponds to "round.tonearest". + FPR_ToNearest, + /// Rounding toward -Inf, corresponds to "round.downward". + FPR_Downward, + /// Rounding toward +Inf, corresponds to "round.upward". + FPR_Upward, + /// Rounding toward zero, corresponds to "round.towardzero". + FPR_TowardZero, + /// Is determined by runtime environment, corresponds to "round.dynamic". + FPR_Dynamic + }; + + /// Possible floating point exception behavior. + enum FPExceptionModeKind { + /// Assume that floating-point exceptions are masked. + FPE_Ignore, + /// Transformations do not cause new exceptions but may hide some. + FPE_MayTrap, + /// Strictly preserve the floating-point exception semantics. + FPE_Strict + }; + enum class LaxVectorConversionKind { /// Permit no implicit vector bitcasts. None, diff --git a/clang/include/clang/Basic/TargetCXXABI.h b/clang/include/clang/Basic/TargetCXXABI.h index b1be40272513b..1ab45d2ce9a1e 100644 --- a/clang/include/clang/Basic/TargetCXXABI.h +++ b/clang/include/clang/Basic/TargetCXXABI.h @@ -103,6 +103,12 @@ class TargetCXXABI { /// of these details is necessarily final yet. WebAssembly, + /// The Fuchsia ABI is a modified version of the Itanium ABI. 
+ /// + /// The relevant changes from the Itanium ABI are: + /// - constructors and destructors return 'this', as in ARM. + Fuchsia, + /// The Microsoft ABI is the ABI used by Microsoft Visual Studio (and /// compatible compilers). /// @@ -133,6 +139,7 @@ class TargetCXXABI { /// Does this ABI generally fall into the Itanium family of ABIs? bool isItaniumFamily() const { switch (getKind()) { + case Fuchsia: case GenericAArch64: case GenericItanium: case GenericARM: @@ -152,6 +159,7 @@ class TargetCXXABI { /// Is this ABI an MSVC-compatible ABI? bool isMicrosoft() const { switch (getKind()) { + case Fuchsia: case GenericAArch64: case GenericItanium: case GenericARM: @@ -182,6 +190,7 @@ class TargetCXXABI { case WebAssembly: // WebAssembly doesn't require any special alignment for member functions. return false; + case Fuchsia: case GenericARM: case GenericAArch64: case GenericMIPS: @@ -257,6 +266,7 @@ class TargetCXXABI { /// done on a generic Itanium platform. bool canKeyFunctionBeInline() const { switch (getKind()) { + case Fuchsia: case GenericARM: case iOS64: case WebAssembly: @@ -277,27 +287,18 @@ class TargetCXXABI { /// padding of a base class? /// /// This decision cannot be changed without breaking platform ABI - /// compatibility, and yet it is tied to language guarantees which - /// the committee has so far seen fit to strengthen no less than - /// three separate times: - /// - originally, there were no restrictions at all; - /// - C++98 declared that objects could not be allocated in the - /// tail padding of a POD type; - /// - C++03 extended the definition of POD to include classes - /// containing member pointers; and - /// - C++11 greatly broadened the definition of POD to include - /// all trivial standard-layout classes. - /// Each of these changes technically took several existing - /// platforms and made them permanently non-conformant. + /// compatibility. In ISO C++98, tail padding reuse was only permitted for + /// non-POD base classes, but that restriction was removed retroactively by + /// DR 43, and tail padding reuse is always permitted in all de facto C++ + /// language modes. However, many platforms use a variant of the old C++98 + /// rule for compatibility. enum TailPaddingUseRules { /// The tail-padding of a base class is always theoretically - /// available, even if it's POD. This is not strictly conforming - /// in any language mode. + /// available, even if it's POD. AlwaysUseTailPadding, /// Only allocate objects in the tail padding of a base class if /// the base class is not POD according to the rules of C++ TR1. - /// This is non-strictly conforming in C++11 mode. UseTailPaddingUnlessPOD03, /// Only allocate objects in the tail padding of a base class if @@ -318,6 +319,7 @@ class TargetCXXABI { // iOS on ARM64 and WebAssembly use the C++11 POD rules. They do not honor // the Itanium exception about classes with over-large bitfields. + case Fuchsia: case iOS64: case WebAssembly: case WatchOS: diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index cc83f4c34c145..33cecdadc686c 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -938,12 +938,6 @@ class TargetInfo : public virtual TransferrableTargetInfo, return true; } - /// Check if the register is reserved globally - /// - /// This function returns true if the register passed in RegName is reserved - /// using the corresponding -ffixed-RegName option. 
- virtual bool isRegisterReservedGlobally(StringRef) const { return true; } - // validateOutputConstraint, validateInputConstraint - Checks that // a constraint is valid and provides information about it. // FIXME: These should return a real error instead of just true/false. diff --git a/clang/include/clang/Basic/arm_fp16.td b/clang/include/clang/Basic/arm_fp16.td index bb9873efac853..79cd16233c104 100644 --- a/clang/include/clang/Basic/arm_fp16.td +++ b/clang/include/clang/Basic/arm_fp16.td @@ -17,118 +17,118 @@ include "arm_neon_incl.td" let ArchGuard = "defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__)" in { // Negate - def VNEGSH : SInst<"vneg", "ss", "Sh">; + def VNEGSH : SInst<"vneg", "11", "Sh">; // Reciprocal/Sqrt - def SCALAR_FRECPSH : IInst<"vrecps", "sss", "Sh">; - def FSQRTSH : SInst<"vsqrt", "ss", "Sh">; - def SCALAR_FRSQRTSH : IInst<"vrsqrts", "sss", "Sh">; + def SCALAR_FRECPSH : IInst<"vrecps", "111", "Sh">; + def FSQRTSH : SInst<"vsqrt", "11", "Sh">; + def SCALAR_FRSQRTSH : IInst<"vrsqrts", "111", "Sh">; // Reciprocal Estimate - def SCALAR_FRECPEH : IInst<"vrecpe", "ss", "Sh">; + def SCALAR_FRECPEH : IInst<"vrecpe", "11", "Sh">; // Reciprocal Exponent - def SCALAR_FRECPXH : IInst<"vrecpx", "ss", "Sh">; + def SCALAR_FRECPXH : IInst<"vrecpx", "11", "Sh">; // Reciprocal Square Root Estimate - def SCALAR_FRSQRTEH : IInst<"vrsqrte", "ss", "Sh">; + def SCALAR_FRSQRTEH : IInst<"vrsqrte", "11", "Sh">; // Rounding - def FRINTZ_S64H : SInst<"vrnd", "ss", "Sh">; - def FRINTA_S64H : SInst<"vrnda", "ss", "Sh">; - def FRINTI_S64H : SInst<"vrndi", "ss", "Sh">; - def FRINTM_S64H : SInst<"vrndm", "ss", "Sh">; - def FRINTN_S64H : SInst<"vrndn", "ss", "Sh">; - def FRINTP_S64H : SInst<"vrndp", "ss", "Sh">; - def FRINTX_S64H : SInst<"vrndx", "ss", "Sh">; + def FRINTZ_S64H : SInst<"vrnd", "11", "Sh">; + def FRINTA_S64H : SInst<"vrnda", "11", "Sh">; + def FRINTI_S64H : SInst<"vrndi", "11", "Sh">; + def FRINTM_S64H : SInst<"vrndm", "11", "Sh">; + def FRINTN_S64H : SInst<"vrndn", "11", "Sh">; + def FRINTP_S64H : SInst<"vrndp", "11", "Sh">; + def FRINTX_S64H : SInst<"vrndx", "11", "Sh">; // Conversion - def SCALAR_SCVTFSH : SInst<"vcvth_f16", "Ys", "sUs">; - def SCALAR_SCVTFSH1 : SInst<"vcvth_f16", "Ys", "iUi">; - def SCALAR_SCVTFSH2 : SInst<"vcvth_f16", "Ys", "lUl">; - def SCALAR_FCVTZSH : SInst<"vcvt_s16", "$s", "Sh">; - def SCALAR_FCVTZSH1 : SInst<"vcvt_s32", "Is", "Sh">; - def SCALAR_FCVTZSH2 : SInst<"vcvt_s64", "Ls", "Sh">; - def SCALAR_FCVTZUH : SInst<"vcvt_u16", "bs", "Sh">; - def SCALAR_FCVTZUH1 : SInst<"vcvt_u32", "Us", "Sh">; - def SCALAR_FCVTZUH2 : SInst<"vcvt_u64", "Os", "Sh">; - def SCALAR_FCVTASH : SInst<"vcvta_s16", "$s", "Sh">; - def SCALAR_FCVTASH1 : SInst<"vcvta_s32", "Is", "Sh">; - def SCALAR_FCVTASH2 : SInst<"vcvta_s64", "Ls", "Sh">; - def SCALAR_FCVTAUH : SInst<"vcvta_u16", "bs", "Sh">; - def SCALAR_FCVTAUH1 : SInst<"vcvta_u32", "Us", "Sh">; - def SCALAR_FCVTAUH2 : SInst<"vcvta_u64", "Os", "Sh">; - def SCALAR_FCVTMSH : SInst<"vcvtm_s16", "$s", "Sh">; - def SCALAR_FCVTMSH1 : SInst<"vcvtm_s32", "Is", "Sh">; - def SCALAR_FCVTMSH2 : SInst<"vcvtm_s64", "Ls", "Sh">; - def SCALAR_FCVTMUH : SInst<"vcvtm_u16", "bs", "Sh">; - def SCALAR_FCVTMUH1 : SInst<"vcvtm_u32", "Us", "Sh">; - def SCALAR_FCVTMUH2 : SInst<"vcvtm_u64", "Os", "Sh">; - def SCALAR_FCVTNSH : SInst<"vcvtn_s16", "$s", "Sh">; - def SCALAR_FCVTNSH1 : SInst<"vcvtn_s32", "Is", "Sh">; - def SCALAR_FCVTNSH2 : SInst<"vcvtn_s64", "Ls", "Sh">; - def SCALAR_FCVTNUH : SInst<"vcvtn_u16", "bs", "Sh">; - def 
SCALAR_FCVTNUH1 : SInst<"vcvtn_u32", "Us", "Sh">; - def SCALAR_FCVTNUH2 : SInst<"vcvtn_u64", "Os", "Sh">; - def SCALAR_FCVTPSH : SInst<"vcvtp_s16", "$s", "Sh">; - def SCALAR_FCVTPSH1 : SInst<"vcvtp_s32", "Is", "Sh">; - def SCALAR_FCVTPSH2 : SInst<"vcvtp_s64", "Ls", "Sh">; - def SCALAR_FCVTPUH : SInst<"vcvtp_u16", "bs", "Sh">; - def SCALAR_FCVTPUH1 : SInst<"vcvtp_u32", "Us", "Sh">; - def SCALAR_FCVTPUH2 : SInst<"vcvtp_u64", "Os", "Sh">; + def SCALAR_SCVTFSH : SInst<"vcvth_f16", "(1F)(1!)", "sUs">; + def SCALAR_SCVTFSH1 : SInst<"vcvth_f16", "(1F<)(1!)", "iUi">; + def SCALAR_SCVTFSH2 : SInst<"vcvth_f16", "(1F<<)(1!)", "lUl">; + def SCALAR_FCVTZSH : SInst<"vcvt_s16", "(1S)1", "Sh">; + def SCALAR_FCVTZSH1 : SInst<"vcvt_s32", "(1S>)1", "Sh">; + def SCALAR_FCVTZSH2 : SInst<"vcvt_s64", "(1S>>)1", "Sh">; + def SCALAR_FCVTZUH : SInst<"vcvt_u16", "(1U)1", "Sh">; + def SCALAR_FCVTZUH1 : SInst<"vcvt_u32", "(1U>)1", "Sh">; + def SCALAR_FCVTZUH2 : SInst<"vcvt_u64", "(1U>>)1", "Sh">; + def SCALAR_FCVTASH : SInst<"vcvta_s16", "(1S)1", "Sh">; + def SCALAR_FCVTASH1 : SInst<"vcvta_s32", "(1S>)1", "Sh">; + def SCALAR_FCVTASH2 : SInst<"vcvta_s64", "(1S>>)1", "Sh">; + def SCALAR_FCVTAUH : SInst<"vcvta_u16", "(1U)1", "Sh">; + def SCALAR_FCVTAUH1 : SInst<"vcvta_u32", "(1U>)1", "Sh">; + def SCALAR_FCVTAUH2 : SInst<"vcvta_u64", "(1U>>)1", "Sh">; + def SCALAR_FCVTMSH : SInst<"vcvtm_s16", "(1S)1", "Sh">; + def SCALAR_FCVTMSH1 : SInst<"vcvtm_s32", "(1S>)1", "Sh">; + def SCALAR_FCVTMSH2 : SInst<"vcvtm_s64", "(1S>>)1", "Sh">; + def SCALAR_FCVTMUH : SInst<"vcvtm_u16", "(1U)1", "Sh">; + def SCALAR_FCVTMUH1 : SInst<"vcvtm_u32", "(1U>)1", "Sh">; + def SCALAR_FCVTMUH2 : SInst<"vcvtm_u64", "(1U>>)1", "Sh">; + def SCALAR_FCVTNSH : SInst<"vcvtn_s16", "(1S)1", "Sh">; + def SCALAR_FCVTNSH1 : SInst<"vcvtn_s32", "(1S>)1", "Sh">; + def SCALAR_FCVTNSH2 : SInst<"vcvtn_s64", "(1S>>)1", "Sh">; + def SCALAR_FCVTNUH : SInst<"vcvtn_u16", "(1U)1", "Sh">; + def SCALAR_FCVTNUH1 : SInst<"vcvtn_u32", "(1U>)1", "Sh">; + def SCALAR_FCVTNUH2 : SInst<"vcvtn_u64", "(1U>>)1", "Sh">; + def SCALAR_FCVTPSH : SInst<"vcvtp_s16", "(1S)1", "Sh">; + def SCALAR_FCVTPSH1 : SInst<"vcvtp_s32", "(1S>)1", "Sh">; + def SCALAR_FCVTPSH2 : SInst<"vcvtp_s64", "(1S>>)1", "Sh">; + def SCALAR_FCVTPUH : SInst<"vcvtp_u16", "(1U)1", "Sh">; + def SCALAR_FCVTPUH1 : SInst<"vcvtp_u32", "(1U>)1", "Sh">; + def SCALAR_FCVTPUH2 : SInst<"vcvtp_u64", "(1U>>)1", "Sh">; let isVCVT_N = 1 in { - def SCALAR_SCVTFSHO : SInst<"vcvth_n_f16", "Ysi", "sUs">; - def SCALAR_SCVTFSH1O: SInst<"vcvth_n_f16", "Ysi", "iUi">; - def SCALAR_SCVTFSH2O: SInst<"vcvth_n_f16", "Ysi", "lUl">; - def SCALAR_FCVTZSHO : SInst<"vcvt_n_s16", "$si", "Sh">; - def SCALAR_FCVTZSH1O: SInst<"vcvt_n_s32", "Isi", "Sh">; - def SCALAR_FCVTZSH2O: SInst<"vcvt_n_s64", "Lsi", "Sh">; - def SCALAR_FCVTZUHO : SInst<"vcvt_n_u16", "bsi", "Sh">; - def SCALAR_FCVTZUH1O: SInst<"vcvt_n_u32", "Usi", "Sh">; - def SCALAR_FCVTZUH2O: SInst<"vcvt_n_u64", "Osi", "Sh">; + def SCALAR_SCVTFSHO : SInst<"vcvth_n_f16", "(1F)(1!)I", "sUs">; + def SCALAR_SCVTFSH1O: SInst<"vcvth_n_f16", "(1F<)(1!)I", "iUi">; + def SCALAR_SCVTFSH2O: SInst<"vcvth_n_f16", "(1F<<)(1!)I", "lUl">; + def SCALAR_FCVTZSHO : SInst<"vcvt_n_s16", "(1S)1I", "Sh">; + def SCALAR_FCVTZSH1O: SInst<"vcvt_n_s32", "(1S>)1I", "Sh">; + def SCALAR_FCVTZSH2O: SInst<"vcvt_n_s64", "(1S>>)1I", "Sh">; + def SCALAR_FCVTZUHO : SInst<"vcvt_n_u16", "(1U)1I", "Sh">; + def SCALAR_FCVTZUH1O: SInst<"vcvt_n_u32", "(1U>)1I", "Sh">; + def SCALAR_FCVTZUH2O: SInst<"vcvt_n_u64", "(1U>>)1I", "Sh">; } // Comparison 
- def SCALAR_CMEQRH : SInst<"vceq", "bss", "Sh">; - def SCALAR_CMEQZH : SInst<"vceqz", "bs", "Sh">; - def SCALAR_CMGERH : SInst<"vcge", "bss", "Sh">; - def SCALAR_CMGEZH : SInst<"vcgez", "bs", "Sh">; - def SCALAR_CMGTRH : SInst<"vcgt", "bss", "Sh">; - def SCALAR_CMGTZH : SInst<"vcgtz", "bs", "Sh">; - def SCALAR_CMLERH : SInst<"vcle", "bss", "Sh">; - def SCALAR_CMLEZH : SInst<"vclez", "bs", "Sh">; - def SCALAR_CMLTH : SInst<"vclt", "bss", "Sh">; - def SCALAR_CMLTZH : SInst<"vcltz", "bs", "Sh">; + def SCALAR_CMEQRH : SInst<"vceq", "(1U)11", "Sh">; + def SCALAR_CMEQZH : SInst<"vceqz", "(1U)1", "Sh">; + def SCALAR_CMGERH : SInst<"vcge", "(1U)11", "Sh">; + def SCALAR_CMGEZH : SInst<"vcgez", "(1U)1", "Sh">; + def SCALAR_CMGTRH : SInst<"vcgt", "(1U)11", "Sh">; + def SCALAR_CMGTZH : SInst<"vcgtz", "(1U)1", "Sh">; + def SCALAR_CMLERH : SInst<"vcle", "(1U)11", "Sh">; + def SCALAR_CMLEZH : SInst<"vclez", "(1U)1", "Sh">; + def SCALAR_CMLTH : SInst<"vclt", "(1U)11", "Sh">; + def SCALAR_CMLTZH : SInst<"vcltz", "(1U)1", "Sh">; // Absolute Compare Mask Greater Than Or Equal - def SCALAR_FACGEH : IInst<"vcage", "bss", "Sh">; - def SCALAR_FACLEH : IInst<"vcale", "bss", "Sh">; + def SCALAR_FACGEH : IInst<"vcage", "(1U)11", "Sh">; + def SCALAR_FACLEH : IInst<"vcale", "(1U)11", "Sh">; // Absolute Compare Mask Greater Than - def SCALAR_FACGT : IInst<"vcagt", "bss", "Sh">; - def SCALAR_FACLT : IInst<"vcalt", "bss", "Sh">; + def SCALAR_FACGT : IInst<"vcagt", "(1U)11", "Sh">; + def SCALAR_FACLT : IInst<"vcalt", "(1U)11", "Sh">; // Scalar Absolute Value - def SCALAR_ABSH : SInst<"vabs", "ss", "Sh">; + def SCALAR_ABSH : SInst<"vabs", "11", "Sh">; // Scalar Absolute Difference - def SCALAR_ABDH: IInst<"vabd", "sss", "Sh">; + def SCALAR_ABDH: IInst<"vabd", "111", "Sh">; // Add/Sub - def VADDSH : SInst<"vadd", "sss", "Sh">; - def VSUBHS : SInst<"vsub", "sss", "Sh">; + def VADDSH : SInst<"vadd", "111", "Sh">; + def VSUBHS : SInst<"vsub", "111", "Sh">; // Max/Min - def VMAXHS : SInst<"vmax", "sss", "Sh">; - def VMINHS : SInst<"vmin", "sss", "Sh">; - def FMAXNMHS : SInst<"vmaxnm", "sss", "Sh">; - def FMINNMHS : SInst<"vminnm", "sss", "Sh">; + def VMAXHS : SInst<"vmax", "111", "Sh">; + def VMINHS : SInst<"vmin", "111", "Sh">; + def FMAXNMHS : SInst<"vmaxnm", "111", "Sh">; + def FMINNMHS : SInst<"vminnm", "111", "Sh">; // Multiplication/Division - def VMULHS : SInst<"vmul", "sss", "Sh">; - def MULXHS : SInst<"vmulx", "sss", "Sh">; - def FDIVHS : SInst<"vdiv", "sss", "Sh">; + def VMULHS : SInst<"vmul", "111", "Sh">; + def MULXHS : SInst<"vmulx", "111", "Sh">; + def FDIVHS : SInst<"vdiv", "111", "Sh">; // Vector fused multiply-add operations - def VFMAHS : SInst<"vfma", "ssss", "Sh">; - def VFMSHS : SInst<"vfms", "ssss", "Sh">; + def VFMAHS : SInst<"vfma", "1111", "Sh">; + def VFMSHS : SInst<"vfms", "1111", "Sh">; } diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index d8d199f464d93..5fa9fc008202b 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -28,25 +28,111 @@ foreach n = [ 2, 4 ] in { "Intrinsic::arm_mve_vld"#n#"q":$IRIntr)>; } +multiclass bit_op_fp { +def "": Intrinsic; +} + +multiclass bit_op_fp_with_inv { +def "": Intrinsic; +} + let params = T.Int in { def vaddq: Intrinsic; +def vandq: Intrinsic; +def vbicq: Intrinsic; +def veorq: Intrinsic; +def vornq: Intrinsic; +def vorrq: Intrinsic; def vsubq: Intrinsic; +def vmulq: Intrinsic; +def vmulhq: Intrinsic $a, $b)>; +def vrmulhq: Intrinsic $a, $b)>; } let params = T.Float in { def 
vaddqf: Intrinsic, NameOverride<"vaddq">; +defm vandqf: bit_op_fp, NameOverride<"vandq">; +defm vbicqf: bit_op_fp_with_inv, NameOverride<"vbicq">; +defm veorqf: bit_op_fp, NameOverride<"veorq">; +defm vornqf: bit_op_fp_with_inv, NameOverride<"vornq">; +defm vorrqf: bit_op_fp, NameOverride<"vorrq">; def vsubqf: Intrinsic, NameOverride<"vsubq">; +def vmulqf: Intrinsic, + NameOverride<"vmulq">; +} + +// The bitcasting below is not overcomplicating the IR because while +// Vector and UVector may be different vector types at the C level i.e. +// vectors of same size signed/unsigned ints. Once they're lowered +// to IR, they are just bit vectors with no sign at all, so the +// bitcasts will be automatically elided by IRBuilder. +multiclass predicated_bit_op_fp { +def "": Intrinsic + (bitcast $a, UVector), + (bitcast $b, UVector), + $pred, + (bitcast $inactive, UVector)), Vector)>; +} + +// Plain intrinsics +let params = T.Usual in { +def vabdq: Intrinsic $a, $b)>; } +// Predicated intrinsics let params = T.Usual in { +def vabdq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"abd_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; def vaddq_m: Intrinsic< Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), (IRInt<"add_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; def vsubq_m: Intrinsic< Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), (IRInt<"sub_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +def vmulq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"mul_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +defm vandq_m: predicated_bit_op_fp<"and_predicated">; +defm vbicq_m: predicated_bit_op_fp<"bic_predicated">; +defm veorq_m: predicated_bit_op_fp<"eor_predicated">; +defm vornq_m: predicated_bit_op_fp<"orn_predicated">; +defm vorrq_m: predicated_bit_op_fp<"orr_predicated">; +} + +// Predicated intrinsics - Int types only +let params = T.Int in { +def vminq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"min_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +def vmaxq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"max_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +def vmulhq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"mulh_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +def vrmulhq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"rmulh_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +} + +// Predicated intrinsics - Float types only +let params = T.Float in { +def vminnmq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"min_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; +def vmaxnmq_m: Intrinsic< + Vector, (args Vector:$inactive, Vector:$a, Vector:$b, Predicate:$pred), + (IRInt<"max_predicated", [Vector, Predicate]> $a, $b, $pred, $inactive)>; } let params = T.Int in { @@ -117,6 +203,54 @@ let params = T.Float in { defm: compare<"le", fcmp_le>; } +let params = T.Signed in { + def vminq: Intrinsic; + def vmaxq: Intrinsic; +} +let params = T.Unsigned in { + def vminqu: Intrinsic, + NameOverride<"vminq">; + def vmaxqu: Intrinsic, + NameOverride<"vmaxq">; +} +let params = T.Float in { + def 
vminnmq: Intrinsic $a, $b)>; + def vmaxnmq: Intrinsic $a, $b)>; +} + +def vpselq: Intrinsic { let params = T.Usual; } +def vpselq_64: Intrinsic< + Vector, (args Vector:$t, Vector:$f, PredOf:$pred), + (bitcast (select $pred, (bitcast $t, VecOf), + (bitcast $f, VecOf)), Vector)>, + NameOverride<"vpselq"> { let params = T.All64; } + +let params = [Void], pnt = PNT_None in { + + multiclass vctp { + def "": Intrinsic (IRIntBase $val)))>; + def _m: Intrinsic (and $inpred, + (IRIntBase $val))))>; + } + defm vctp8q: vctp, "arm_mve_vctp8">; + defm vctp16q: vctp, "arm_mve_vctp16">; + defm vctp32q: vctp, "arm_mve_vctp32">; + defm vctp64q: vctp, "arm_mve_vctp64">; + + def vpnot: Intrinsic, (args unpromoted>:$pred), + (xor $pred, (u16 65535))>; + +} + multiclass contiguous_load same_size, list wider> { // Intrinsics named with explicit memory and element sizes that match: diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td index 27cdada02ec4f..d837a1d33d000 100644 --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ -58,12 +58,16 @@ class CGHelperFn : IRBuilderBase { let prefix = func # "(Builder, "; } def add: IRBuilder<"CreateAdd">; +def mul: IRBuilder<"CreateMul">; +def not: IRBuilder<"CreateNot">; def or: IRBuilder<"CreateOr">; def and: IRBuilder<"CreateAnd">; +def xor: IRBuilder<"CreateXor">; def sub: IRBuilder<"CreateSub">; def shl: IRBuilder<"CreateShl">; def lshr: IRBuilder<"CreateLShr">; def fadd: IRBuilder<"CreateFAdd">; +def fmul: IRBuilder<"CreateFMul">; def fsub: IRBuilder<"CreateFSub">; def load: IRBuilder<"CreateLoad"> { let special_params = [IRBuilderAddrParam<0>]; @@ -103,6 +107,7 @@ def fcmp_ge: IRBuilder<"CreateFCmpOGE">; def fcmp_lt: IRBuilder<"CreateFCmpOLT">; def fcmp_le: IRBuilder<"CreateFCmpOLE">; def splat: CGHelperFn<"ARMMVEVectorSplat">; +def select: IRBuilder<"CreateSelect">; // A node that makes an Address out of a pointer-typed Value, by // providing an alignment as the second argument. 
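A minimal hand-written C++ sketch (assuming a recent LLVM; this is not the code MveEmitter actually generates, and the function and variable names are mine) of the (bitcast (select ...)) pattern used by vpselq above. It shows why the bitcasts cost nothing once Vector and UVector lower to the same IR type: IRBuilder::CreateBitCast simply returns its operand when source and destination types are identical, which is what the "automatically elided by IRBuilder" comment above relies on.

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IR/Verifier.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("vpselq-sketch", Ctx);
    auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
    auto *PredTy = FixedVectorType::get(Type::getInt1Ty(Ctx), 4);
    auto *FnTy = FunctionType::get(VecTy, {PredTy, VecTy, VecTy}, false);
    Function *F =
        Function::Create(FnTy, Function::ExternalLinkage, "vpselq_sketch", M);
    IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));

    Value *Pred = F->getArg(0), *T = F->getArg(1), *Fv = F->getArg(2);
    // Signed and unsigned vectors are both <4 x i32> at the IR level, so
    // these "bitcasts" fold to their operands: no instruction is emitted.
    Value *TU = B.CreateBitCast(T, VecTy);
    Value *FU = B.CreateBitCast(Fv, VecTy);
    // This is the 'select' IRBuilder binding defined above (CreateSelect).
    Value *Sel = B.CreateSelect(Pred, TU, FU);
    B.CreateRet(B.CreateBitCast(Sel, VecTy));

    M.print(outs(), nullptr); // only the select and ret survive
    return verifyModule(M, &errs()); // nonzero if the module is malformed
  }

Printing the module confirms that only the select and the return remain; the two inner CreateBitCast calls never materialize as instructions.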
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 127c5af97ce67..a4dc21b643110 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -109,7 +109,8 @@ def OP_OR : Op<(op "|", $p0, $p1)>; def OP_XOR : Op<(op "^", $p0, $p1)>; def OP_ANDN : Op<(op "&", $p0, (op "~", $p1))>; def OP_ORN : Op<(op "|", $p0, (op "~", $p1))>; -def OP_CAST : Op<(cast "R", $p0)>; +def OP_CAST : LOp<[(save_temp $promote, $p0), + (cast "R", $promote)]>; def OP_HI : Op<(shuffle $p0, $p0, (highhalf mask0))>; def OP_LO : Op<(shuffle $p0, $p0, (lowhalf mask0))>; def OP_CONC : Op<(shuffle $p0, $p1, (add mask0, mask1))>; @@ -226,240 +227,240 @@ def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1, //////////////////////////////////////////////////////////////////////////////// // E.3.1 Addition -def VADD : IOpInst<"vadd", "ddd", +def VADD : IOpInst<"vadd", "...", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUl", OP_ADD>; -def VADDL : SOpInst<"vaddl", "wdd", "csiUcUsUi", OP_ADDL>; -def VADDW : SOpInst<"vaddw", "wwd", "csiUcUsUi", OP_ADDW>; -def VHADD : SInst<"vhadd", "ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; -def VRHADD : SInst<"vrhadd", "ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; -def VQADD : SInst<"vqadd", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VADDHN : IInst<"vaddhn", "hkk", "silUsUiUl">; -def VRADDHN : IInst<"vraddhn", "hkk", "silUsUiUl">; +def VADDL : SOpInst<"vaddl", "(>Q)..", "csiUcUsUi", OP_ADDL>; +def VADDW : SOpInst<"vaddw", "(>Q)(>Q).", "csiUcUsUi", OP_ADDW>; +def VHADD : SInst<"vhadd", "...", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VRHADD : SInst<"vrhadd", "...", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VQADD : SInst<"vqadd", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VADDHN : IInst<"vaddhn", "; +def VRADDHN : IInst<"vraddhn", "; //////////////////////////////////////////////////////////////////////////////// // E.3.2 Multiplication -def VMUL : IOpInst<"vmul", "ddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MUL>; -def VMULP : SInst<"vmul", "ddd", "PcQPc">; -def VMLA : IOpInst<"vmla", "dddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLA>; -def VMLAL : SOpInst<"vmlal", "wwdd", "csiUcUsUi", OP_MLAL>; -def VMLS : IOpInst<"vmls", "dddd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLS>; -def VMLSL : SOpInst<"vmlsl", "wwdd", "csiUcUsUi", OP_MLSL>; -def VQDMULH : SInst<"vqdmulh", "ddd", "siQsQi">; -def VQRDMULH : SInst<"vqrdmulh", "ddd", "siQsQi">; +def VMUL : IOpInst<"vmul", "...", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MUL>; +def VMULP : SInst<"vmul", "...", "PcQPc">; +def VMLA : IOpInst<"vmla", "....", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLA>; +def VMLAL : SOpInst<"vmlal", "(>Q)(>Q)..", "csiUcUsUi", OP_MLAL>; +def VMLS : IOpInst<"vmls", "....", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_MLS>; +def VMLSL : SOpInst<"vmlsl", "(>Q)(>Q)..", "csiUcUsUi", OP_MLSL>; +def VQDMULH : SInst<"vqdmulh", "...", "siQsQi">; +def VQRDMULH : SInst<"vqrdmulh", "...", "siQsQi">; let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in { -def VQRDMLAH : SOpInst<"vqrdmlah", "dddd", "siQsQi", OP_QRDMLAH>; -def VQRDMLSH : SOpInst<"vqrdmlsh", "dddd", "siQsQi", OP_QRDMLSH>; +def VQRDMLAH : SOpInst<"vqrdmlah", "....", "siQsQi", OP_QRDMLAH>; +def VQRDMLSH : SOpInst<"vqrdmlsh", "....", "siQsQi", OP_QRDMLSH>; } -def VQDMLAL : SInst<"vqdmlal", "wwdd", "si">; -def VQDMLSL : SInst<"vqdmlsl", "wwdd", "si">; -def VMULL : SInst<"vmull", "wdd", "csiUcUsUiPc">; -def VQDMULL : SInst<"vqdmull", "wdd", "si">; +def VQDMLAL : SInst<"vqdmlal", "(>Q)(>Q)..", "si">; +def VQDMLSL : SInst<"vqdmlsl", "(>Q)(>Q)..", "si">; +def VMULL : 
SInst<"vmull", "(>Q)..", "csiUcUsUiPc">; +def VQDMULL : SInst<"vqdmull", "(>Q)..", "si">; //////////////////////////////////////////////////////////////////////////////// // E.3.3 Subtraction -def VSUB : IOpInst<"vsub", "ddd", +def VSUB : IOpInst<"vsub", "...", "csilfUcUsUiUlQcQsQiQlQfQUcQUsQUiQUl", OP_SUB>; -def VSUBL : SOpInst<"vsubl", "wdd", "csiUcUsUi", OP_SUBL>; -def VSUBW : SOpInst<"vsubw", "wwd", "csiUcUsUi", OP_SUBW>; -def VQSUB : SInst<"vqsub", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VHSUB : SInst<"vhsub", "ddd", "csiUcUsUiQcQsQiQUcQUsQUi">; -def VSUBHN : IInst<"vsubhn", "hkk", "silUsUiUl">; -def VRSUBHN : IInst<"vrsubhn", "hkk", "silUsUiUl">; +def VSUBL : SOpInst<"vsubl", "(>Q)..", "csiUcUsUi", OP_SUBL>; +def VSUBW : SOpInst<"vsubw", "(>Q)(>Q).", "csiUcUsUi", OP_SUBW>; +def VQSUB : SInst<"vqsub", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VHSUB : SInst<"vhsub", "...", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VSUBHN : IInst<"vsubhn", "; +def VRSUBHN : IInst<"vrsubhn", "; //////////////////////////////////////////////////////////////////////////////// // E.3.4 Comparison -def VCEQ : IOpInst<"vceq", "udd", "csifUcUsUiPcQcQsQiQfQUcQUsQUiQPc", OP_EQ>; -def VCGE : SOpInst<"vcge", "udd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GE>; +def VCEQ : IOpInst<"vceq", "U..", "csifUcUsUiPcQcQsQiQfQUcQUsQUiQPc", OP_EQ>; +def VCGE : SOpInst<"vcge", "U..", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GE>; let InstName = "vcge" in -def VCLE : SOpInst<"vcle", "udd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LE>; -def VCGT : SOpInst<"vcgt", "udd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GT>; +def VCLE : SOpInst<"vcle", "U..", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LE>; +def VCGT : SOpInst<"vcgt", "U..", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_GT>; let InstName = "vcgt" in -def VCLT : SOpInst<"vclt", "udd", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LT>; +def VCLT : SOpInst<"vclt", "U..", "csifUcUsUiQcQsQiQfQUcQUsQUi", OP_LT>; let InstName = "vacge" in { -def VCAGE : IInst<"vcage", "udd", "fQf">; -def VCALE : IInst<"vcale", "udd", "fQf">; +def VCAGE : IInst<"vcage", "U..", "fQf">; +def VCALE : IInst<"vcale", "U..", "fQf">; } let InstName = "vacgt" in { -def VCAGT : IInst<"vcagt", "udd", "fQf">; -def VCALT : IInst<"vcalt", "udd", "fQf">; +def VCAGT : IInst<"vcagt", "U..", "fQf">; +def VCALT : IInst<"vcalt", "U..", "fQf">; } -def VTST : WInst<"vtst", "udd", "csiUcUsUiPcPsQcQsQiQUcQUsQUiQPcQPs">; +def VTST : WInst<"vtst", "U..", "csiUcUsUiPcPsQcQsQiQUcQUsQUiQPcQPs">; //////////////////////////////////////////////////////////////////////////////// // E.3.5 Absolute Difference -def VABD : SInst<"vabd", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; -def VABDL : SOpInst<"vabdl", "wdd", "csiUcUsUi", OP_ABDL>; -def VABA : SOpInst<"vaba", "dddd", "csiUcUsUiQcQsQiQUcQUsQUi", OP_ABA>; -def VABAL : SOpInst<"vabal", "wwdd", "csiUcUsUi", OP_ABAL>; +def VABD : SInst<"vabd", "...", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VABDL : SOpInst<"vabdl", "(>Q)..", "csiUcUsUi", OP_ABDL>; +def VABA : SOpInst<"vaba", "....", "csiUcUsUiQcQsQiQUcQUsQUi", OP_ABA>; +def VABAL : SOpInst<"vabal", "(>Q)(>Q)..", "csiUcUsUi", OP_ABAL>; //////////////////////////////////////////////////////////////////////////////// // E.3.6 Max/Min -def VMAX : SInst<"vmax", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; -def VMIN : SInst<"vmin", "ddd", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VMAX : SInst<"vmax", "...", "csiUcUsUifQcQsQiQUcQUsQUiQf">; +def VMIN : SInst<"vmin", "...", "csiUcUsUifQcQsQiQUcQUsQUiQf">; //////////////////////////////////////////////////////////////////////////////// // E.3.7 Pairwise Addition 
-def VPADD : IInst<"vpadd", "ddd", "csiUcUsUif">; -def VPADDL : SInst<"vpaddl", "nd", "csiUcUsUiQcQsQiQUcQUsQUi">; -def VPADAL : SInst<"vpadal", "nnd", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VPADD : IInst<"vpadd", "...", "csiUcUsUif">; +def VPADDL : SInst<"vpaddl", ">.", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VPADAL : SInst<"vpadal", ">>.", "csiUcUsUiQcQsQiQUcQUsQUi">; //////////////////////////////////////////////////////////////////////////////// // E.3.8-9 Folding Max/Min -def VPMAX : SInst<"vpmax", "ddd", "csiUcUsUif">; -def VPMIN : SInst<"vpmin", "ddd", "csiUcUsUif">; +def VPMAX : SInst<"vpmax", "...", "csiUcUsUif">; +def VPMIN : SInst<"vpmin", "...", "csiUcUsUif">; //////////////////////////////////////////////////////////////////////////////// // E.3.10 Reciprocal/Sqrt -def VRECPS : IInst<"vrecps", "ddd", "fQf">; -def VRSQRTS : IInst<"vrsqrts", "ddd", "fQf">; +def VRECPS : IInst<"vrecps", "...", "fQf">; +def VRSQRTS : IInst<"vrsqrts", "...", "fQf">; //////////////////////////////////////////////////////////////////////////////// // E.3.11 Shifts by signed variable -def VSHL : SInst<"vshl", "ddx", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VQSHL : SInst<"vqshl", "ddx", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VRSHL : SInst<"vrshl", "ddx", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VQRSHL : SInst<"vqrshl", "ddx", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSHL : SInst<"vshl", "..S", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHL : SInst<"vqshl", "..S", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSHL : SInst<"vrshl", "..S", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQRSHL : SInst<"vqrshl", "..S", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; //////////////////////////////////////////////////////////////////////////////// // E.3.12 Shifts by constant let isShift = 1 in { -def VSHR_N : SInst<"vshr_n", "ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VSHL_N : IInst<"vshl_n", "ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VRSHR_N : SInst<"vrshr_n", "ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VSRA_N : SInst<"vsra_n", "dddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VRSRA_N : SInst<"vrsra_n", "dddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VQSHL_N : SInst<"vqshl_n", "ddi", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; -def VQSHLU_N : SInst<"vqshlu_n", "udi", "csilQcQsQiQl">; -def VSHRN_N : IInst<"vshrn_n", "hki", "silUsUiUl">; -def VQSHRUN_N : SInst<"vqshrun_n", "eki", "sil">; -def VQRSHRUN_N : SInst<"vqrshrun_n", "eki", "sil">; -def VQSHRN_N : SInst<"vqshrn_n", "hki", "silUsUiUl">; -def VRSHRN_N : IInst<"vrshrn_n", "hki", "silUsUiUl">; -def VQRSHRN_N : SInst<"vqrshrn_n", "hki", "silUsUiUl">; -def VSHLL_N : SInst<"vshll_n", "wdi", "csiUcUsUi">; +def VSHR_N : SInst<"vshr_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSHL_N : IInst<"vshl_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSHR_N : SInst<"vrshr_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VSRA_N : SInst<"vsra_n", "...I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VRSRA_N : SInst<"vrsra_n", "...I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHL_N : SInst<"vqshl_n", "..I", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl">; +def VQSHLU_N : SInst<"vqshlu_n", "U.I", "csilQcQsQiQl">; +def VSHRN_N : IInst<"vshrn_n", "; +def VQSHRUN_N : SInst<"vqshrun_n", "(; +def VQRSHRUN_N : SInst<"vqrshrun_n", "(; +def VQSHRN_N : SInst<"vqshrn_n", "; +def VRSHRN_N : IInst<"vrshrn_n", "; +def VQRSHRN_N : SInst<"vqrshrn_n", "; +def VSHLL_N : SInst<"vshll_n", "(>Q).I", "csiUcUsUi">; //////////////////////////////////////////////////////////////////////////////// // 
E.3.13 Shifts with insert -def VSRI_N : WInst<"vsri_n", "dddi", +def VSRI_N : WInst<"vsri_n", "...I", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">; -def VSLI_N : WInst<"vsli_n", "dddi", +def VSLI_N : WInst<"vsli_n", "...I", "csilUcUsUiUlPcPsQcQsQiQlQUcQUsQUiQUlQPcQPs">; } //////////////////////////////////////////////////////////////////////////////// // E.3.14 Loads and stores of a single vector -def VLD1 : WInst<"vld1", "dc", +def VLD1 : WInst<"vld1", ".(c*!)", "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">; -def VLD1_X2 : WInst<"vld1_x2", "2c", +def VLD1_X2 : WInst<"vld1_x2", "2(c*!)", "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">; -def VLD1_X3 : WInst<"vld1_x3", "3c", +def VLD1_X3 : WInst<"vld1_x3", "3(c*!)", "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">; -def VLD1_X4 : WInst<"vld1_x4", "4c", +def VLD1_X4 : WInst<"vld1_x4", "4(c*!)", "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">; -def VLD1_LANE : WInst<"vld1_lane", "dcdi", +def VLD1_LANE : WInst<"vld1_lane", ".(c*!).I", "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">; -def VLD1_DUP : WInst<"vld1_dup", "dc", +def VLD1_DUP : WInst<"vld1_dup", ".(c*!)", "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">; -def VST1 : WInst<"vst1", "vpd", +def VST1 : WInst<"vst1", "v*(.!)", "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">; -def VST1_X2 : WInst<"vst1_x2", "vp2", +def VST1_X2 : WInst<"vst1_x2", "v*(2!)", "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">; -def VST1_X3 : WInst<"vst1_x3", "vp3", +def VST1_X3 : WInst<"vst1_x3", "v*(3!)", "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">; -def VST1_X4 : WInst<"vst1_x4", "vp4", +def VST1_X4 : WInst<"vst1_x4", "v*(4!)", "cfilsUcUiUlUsQcQfQiQlQsQUcQUiQUlQUsPcPsQPcQPs">; -def VST1_LANE : WInst<"vst1_lane", "vpdi", +def VST1_LANE : WInst<"vst1_lane", "v*(.!)I", "QUcQUsQUiQUlQcQsQiQlQfQPcQPsUcUsUiUlcsilfPcPs">; let ArchGuard = "(__ARM_FP & 2)" in { -def VLD1_F16 : WInst<"vld1", "dc", "hQh">; -def VLD1_X2_F16 : WInst<"vld1_x2", "2c", "hQh">; -def VLD1_X3_F16 : WInst<"vld1_x3", "3c", "hQh">; -def VLD1_X4_F16 : WInst<"vld1_x4", "4c", "hQh">; -def VLD1_LANE_F16 : WInst<"vld1_lane", "dcdi", "hQh">; -def VLD1_DUP_F16 : WInst<"vld1_dup", "dc", "hQh">; -def VST1_F16 : WInst<"vst1", "vpd", "hQh">; -def VST1_X2_F16 : WInst<"vst1_x2", "vp2", "hQh">; -def VST1_X3_F16 : WInst<"vst1_x3", "vp3", "hQh">; -def VST1_X4_F16 : WInst<"vst1_x4", "vp4", "hQh">; -def VST1_LANE_F16 : WInst<"vst1_lane", "vpdi", "hQh">; +def VLD1_F16 : WInst<"vld1", ".(c*!)", "hQh">; +def VLD1_X2_F16 : WInst<"vld1_x2", "2(c*!)", "hQh">; +def VLD1_X3_F16 : WInst<"vld1_x3", "3(c*!)", "hQh">; +def VLD1_X4_F16 : WInst<"vld1_x4", "4(c*!)", "hQh">; +def VLD1_LANE_F16 : WInst<"vld1_lane", ".(c*!).I", "hQh">; +def VLD1_DUP_F16 : WInst<"vld1_dup", ".(c*!)", "hQh">; +def VST1_F16 : WInst<"vst1", "v*(.!)", "hQh">; +def VST1_X2_F16 : WInst<"vst1_x2", "v*(2!)", "hQh">; +def VST1_X3_F16 : WInst<"vst1_x3", "v*(3!)", "hQh">; +def VST1_X4_F16 : WInst<"vst1_x4", "v*(4!)", "hQh">; +def VST1_LANE_F16 : WInst<"vst1_lane", "v*(.!)I", "hQh">; } //////////////////////////////////////////////////////////////////////////////// // E.3.15 Loads and stores of an N-element structure -def VLD2 : WInst<"vld2", "2c", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; -def VLD3 : WInst<"vld3", "3c", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; -def VLD4 : WInst<"vld4", "4c", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; -def VLD2_DUP : WInst<"vld2_dup", "2c", +def VLD2 : WInst<"vld2", "2(c*!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; +def VLD3 : WInst<"vld3", "3(c*!)", 
"QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; +def VLD4 : WInst<"vld4", "4(c*!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; +def VLD2_DUP : WInst<"vld2_dup", "2(c*!)", "UcUsUiUlcsilfPcPsQcQfQiQlQsQPcQPsQUcQUiQUlQUs">; -def VLD3_DUP : WInst<"vld3_dup", "3c", +def VLD3_DUP : WInst<"vld3_dup", "3(c*!)", "UcUsUiUlcsilfPcPsQcQfQiQlQsQPcQPsQUcQUiQUlQUs">; -def VLD4_DUP : WInst<"vld4_dup", "4c", +def VLD4_DUP : WInst<"vld4_dup", "4(c*!)", "UcUsUiUlcsilfPcPsQcQfQiQlQsQPcQPsQUcQUiQUlQUs">; -def VLD2_LANE : WInst<"vld2_lane", "2c2i", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; -def VLD3_LANE : WInst<"vld3_lane", "3c3i", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; -def VLD4_LANE : WInst<"vld4_lane", "4c4i", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; -def VST2 : WInst<"vst2", "vp2", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; -def VST3 : WInst<"vst3", "vp3", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; -def VST4 : WInst<"vst4", "vp4", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; -def VST2_LANE : WInst<"vst2_lane", "vp2i", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; -def VST3_LANE : WInst<"vst3_lane", "vp3i", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; -def VST4_LANE : WInst<"vst4_lane", "vp4i", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; +def VLD2_LANE : WInst<"vld2_lane", "2(c*!)2I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; +def VLD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; +def VLD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; +def VST2 : WInst<"vst2", "v*(2!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; +def VST3 : WInst<"vst3", "v*(3!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; +def VST4 : WInst<"vst4", "v*(4!)", "QUcQUsQUiQcQsQiQfQPcQPsUcUsUiUlcsilfPcPs">; +def VST2_LANE : WInst<"vst2_lane", "v*(2!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; +def VST3_LANE : WInst<"vst3_lane", "v*(3!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; +def VST4_LANE : WInst<"vst4_lane", "v*(4!)I", "QUsQUiQsQiQfQPsUcUsUicsifPcPs">; let ArchGuard = "(__ARM_FP & 2)" in { -def VLD2_F16 : WInst<"vld2", "2c", "hQh">; -def VLD3_F16 : WInst<"vld3", "3c", "hQh">; -def VLD4_F16 : WInst<"vld4", "4c", "hQh">; -def VLD2_DUP_F16 : WInst<"vld2_dup", "2c", "hQh">; -def VLD3_DUP_F16 : WInst<"vld3_dup", "3c", "hQh">; -def VLD4_DUP_F16 : WInst<"vld4_dup", "4c", "hQh">; -def VLD2_LANE_F16 : WInst<"vld2_lane", "2c2i", "hQh">; -def VLD3_LANE_F16 : WInst<"vld3_lane", "3c3i", "hQh">; -def VLD4_LANE_F16 : WInst<"vld4_lane", "4c4i", "hQh">; -def VST2_F16 : WInst<"vst2", "vp2", "hQh">; -def VST3_F16 : WInst<"vst3", "vp3", "hQh">; -def VST4_F16 : WInst<"vst4", "vp4", "hQh">; -def VST2_LANE_F16 : WInst<"vst2_lane", "vp2i", "hQh">; -def VST3_LANE_F16 : WInst<"vst3_lane", "vp3i", "hQh">; -def VST4_LANE_F16 : WInst<"vst4_lane", "vp4i", "hQh">; +def VLD2_F16 : WInst<"vld2", "2(c*!)", "hQh">; +def VLD3_F16 : WInst<"vld3", "3(c*!)", "hQh">; +def VLD4_F16 : WInst<"vld4", "4(c*!)", "hQh">; +def VLD2_DUP_F16 : WInst<"vld2_dup", "2(c*!)", "hQh">; +def VLD3_DUP_F16 : WInst<"vld3_dup", "3(c*!)", "hQh">; +def VLD4_DUP_F16 : WInst<"vld4_dup", "4(c*!)", "hQh">; +def VLD2_LANE_F16 : WInst<"vld2_lane", "2(c*!)2I", "hQh">; +def VLD3_LANE_F16 : WInst<"vld3_lane", "3(c*!)3I", "hQh">; +def VLD4_LANE_F16 : WInst<"vld4_lane", "4(c*!)4I", "hQh">; +def VST2_F16 : WInst<"vst2", "v*(2!)", "hQh">; +def VST3_F16 : WInst<"vst3", "v*(3!)", "hQh">; +def VST4_F16 : WInst<"vst4", "v*(4!)", "hQh">; +def VST2_LANE_F16 : WInst<"vst2_lane", "v*(2!)I", "hQh">; +def VST3_LANE_F16 : WInst<"vst3_lane", "v*(3!)I", "hQh">; +def VST4_LANE_F16 : WInst<"vst4_lane", "v*(4!)I", "hQh">; } 
//////////////////////////////////////////////////////////////////////////////// // E.3.16 Extract lanes from a vector let InstName = "vmov" in -def VGET_LANE : IInst<"vget_lane", "sdi", +def VGET_LANE : IInst<"vget_lane", "1.I", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; //////////////////////////////////////////////////////////////////////////////// // E.3.17 Set lanes within a vector let InstName = "vmov" in -def VSET_LANE : IInst<"vset_lane", "dsdi", +def VSET_LANE : IInst<"vset_lane", ".1.I", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl">; //////////////////////////////////////////////////////////////////////////////// // E.3.18 Initialize a vector from bit pattern -def VCREATE : NoTestOpInst<"vcreate", "dl", "csihfUcUsUiUlPcPsl", OP_CAST> { +def VCREATE : NoTestOpInst<"vcreate", ".(IU>)", "csihfUcUsUiUlPcPsl", OP_CAST> { let BigEndianSafe = 1; } //////////////////////////////////////////////////////////////////////////////// // E.3.19 Set all lanes to same value let InstName = "vmov" in { -def VDUP_N : WOpInst<"vdup_n", "ds", +def VDUP_N : WOpInst<"vdup_n", ".1", "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", OP_DUP>; -def VMOV_N : WOpInst<"vmov_n", "ds", +def VMOV_N : WOpInst<"vmov_n", ".1", "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", OP_DUP>; } let InstName = "" in -def VDUP_LANE: WOpInst<"vdup_lane", "dgi", +def VDUP_LANE: WOpInst<"vdup_lane", ".qI", "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", OP_DUP_LN>; //////////////////////////////////////////////////////////////////////////////// // E.3.20 Combining vectors -def VCOMBINE : NoTestOpInst<"vcombine", "kdd", "csilhfUcUsUiUlPcPs", OP_CONC>; +def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPs", OP_CONC>; //////////////////////////////////////////////////////////////////////////////// // E.3.21 Splitting vectors @@ -468,127 +469,127 @@ def VCOMBINE : NoTestOpInst<"vcombine", "kdd", "csilhfUcUsUiUlPcPs", OP_CONC>; // versions of these intrinsics in both AArch32 and AArch64 architectures. See // D45668 for more details. 
let InstName = "vmov" in { -def VGET_HIGH : NoTestOpInst<"vget_high", "dk", "csilhfUcUsUiUlPcPs", OP_HI>; -def VGET_LOW : NoTestOpInst<"vget_low", "dk", "csilhfUcUsUiUlPcPs", OP_LO>; +def VGET_HIGH : NoTestOpInst<"vget_high", ".Q", "csilhfUcUsUiUlPcPs", OP_HI>; +def VGET_LOW : NoTestOpInst<"vget_low", ".Q", "csilhfUcUsUiUlPcPs", OP_LO>; } //////////////////////////////////////////////////////////////////////////////// // E.3.22 Converting vectors let ArchGuard = "(__ARM_FP & 2)" in { - def VCVT_F16_F32 : SInst<"vcvt_f16_f32", "md", "Hf">; - def VCVT_F32_F16 : SInst<"vcvt_f32_f16", "wd", "h">; + def VCVT_F16_F32 : SInst<"vcvt_f16_f32", "(; + def VCVT_F32_F16 : SInst<"vcvt_f32_f16", "(>Q)(.!)", "h">; } -def VCVT_S32 : SInst<"vcvt_s32", "xd", "fQf">; -def VCVT_U32 : SInst<"vcvt_u32", "ud", "fQf">; -def VCVT_F32 : SInst<"vcvt_f32", "fd", "iUiQiQUi">; +def VCVT_S32 : SInst<"vcvt_s32", "S.", "fQf">; +def VCVT_U32 : SInst<"vcvt_u32", "U.", "fQf">; +def VCVT_F32 : SInst<"vcvt_f32", "F(.!)", "iUiQiQUi">; let isVCVT_N = 1 in { -def VCVT_N_S32 : SInst<"vcvt_n_s32", "xdi", "fQf">; -def VCVT_N_U32 : SInst<"vcvt_n_u32", "udi", "fQf">; -def VCVT_N_F32 : SInst<"vcvt_n_f32", "fdi", "iUiQiQUi">; +def VCVT_N_S32 : SInst<"vcvt_n_s32", "S.I", "fQf">; +def VCVT_N_U32 : SInst<"vcvt_n_u32", "U.I", "fQf">; +def VCVT_N_F32 : SInst<"vcvt_n_f32", "F(.!)I", "iUiQiQUi">; } -def VMOVN : IInst<"vmovn", "hk", "silUsUiUl">; -def VMOVL : SInst<"vmovl", "wd", "csiUcUsUi">; -def VQMOVN : SInst<"vqmovn", "hk", "silUsUiUl">; -def VQMOVUN : SInst<"vqmovun", "ek", "sil">; +def VMOVN : IInst<"vmovn", "; +def VMOVL : SInst<"vmovl", "(>Q).", "csiUcUsUi">; +def VQMOVN : SInst<"vqmovn", "; +def VQMOVUN : SInst<"vqmovun", "(; //////////////////////////////////////////////////////////////////////////////// // E.3.23-24 Table lookup, Extended table lookup let InstName = "vtbl" in { -def VTBL1 : WInst<"vtbl1", "ddt", "UccPc">; -def VTBL2 : WInst<"vtbl2", "d2t", "UccPc">; -def VTBL3 : WInst<"vtbl3", "d3t", "UccPc">; -def VTBL4 : WInst<"vtbl4", "d4t", "UccPc">; +def VTBL1 : WInst<"vtbl1", "..p", "UccPc">; +def VTBL2 : WInst<"vtbl2", ".2p", "UccPc">; +def VTBL3 : WInst<"vtbl3", ".3p", "UccPc">; +def VTBL4 : WInst<"vtbl4", ".4p", "UccPc">; } let InstName = "vtbx" in { -def VTBX1 : WInst<"vtbx1", "dddt", "UccPc">; -def VTBX2 : WInst<"vtbx2", "dd2t", "UccPc">; -def VTBX3 : WInst<"vtbx3", "dd3t", "UccPc">; -def VTBX4 : WInst<"vtbx4", "dd4t", "UccPc">; +def VTBX1 : WInst<"vtbx1", "...p", "UccPc">; +def VTBX2 : WInst<"vtbx2", "..2p", "UccPc">; +def VTBX3 : WInst<"vtbx3", "..3p", "UccPc">; +def VTBX4 : WInst<"vtbx4", "..4p", "UccPc">; } //////////////////////////////////////////////////////////////////////////////// // E.3.25 Operations with a scalar value -def VMLA_LANE : IOpInst<"vmla_lane", "dddgi", +def VMLA_LANE : IOpInst<"vmla_lane", "...qI", "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; -def VMLAL_LANE : SOpInst<"vmlal_lane", "wwddi", "siUsUi", OP_MLAL_LN>; -def VQDMLAL_LANE : SOpInst<"vqdmlal_lane", "wwddi", "si", OP_QDMLAL_LN>; -def VMLS_LANE : IOpInst<"vmls_lane", "dddgi", +def VMLAL_LANE : SOpInst<"vmlal_lane", "(>Q)(>Q)..I", "siUsUi", OP_MLAL_LN>; +def VQDMLAL_LANE : SOpInst<"vqdmlal_lane", "(>Q)(>Q)..I", "si", OP_QDMLAL_LN>; +def VMLS_LANE : IOpInst<"vmls_lane", "...qI", "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; -def VMLSL_LANE : SOpInst<"vmlsl_lane", "wwddi", "siUsUi", OP_MLSL_LN>; -def VQDMLSL_LANE : SOpInst<"vqdmlsl_lane", "wwddi", "si", OP_QDMLSL_LN>; -def VMUL_N : IOpInst<"vmul_n", "dds", "sifUsUiQsQiQfQUsQUi", OP_MUL_N>; -def VMUL_LANE : 
IOpInst<"vmul_lane", "ddgi", +def VMLSL_LANE : SOpInst<"vmlsl_lane", "(>Q)(>Q)..I", "siUsUi", OP_MLSL_LN>; +def VQDMLSL_LANE : SOpInst<"vqdmlsl_lane", "(>Q)(>Q)..I", "si", OP_QDMLSL_LN>; +def VMUL_N : IOpInst<"vmul_n", "..1", "sifUsUiQsQiQfQUsQUi", OP_MUL_N>; +def VMUL_LANE : IOpInst<"vmul_lane", "..qI", "sifUsUiQsQiQfQUsQUi", OP_MUL_LN>; -def VMULL_N : SOpInst<"vmull_n", "wds", "siUsUi", OP_MULL_N>; -def VMULL_LANE : SOpInst<"vmull_lane", "wddi", "siUsUi", OP_MULL_LN>; -def VQDMULL_N : SOpInst<"vqdmull_n", "wds", "si", OP_QDMULL_N>; -def VQDMULL_LANE : SOpInst<"vqdmull_lane", "wddi", "si", OP_QDMULL_LN>; -def VQDMULH_N : SOpInst<"vqdmulh_n", "dds", "siQsQi", OP_QDMULH_N>; -def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "ddgi", "siQsQi", OP_QDMULH_LN>; -def VQRDMULH_N : SOpInst<"vqrdmulh_n", "dds", "siQsQi", OP_QRDMULH_N>; -def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "ddgi", "siQsQi", OP_QRDMULH_LN>; +def VMULL_N : SOpInst<"vmull_n", "(>Q).1", "siUsUi", OP_MULL_N>; +def VMULL_LANE : SOpInst<"vmull_lane", "(>Q)..I", "siUsUi", OP_MULL_LN>; +def VQDMULL_N : SOpInst<"vqdmull_n", "(>Q).1", "si", OP_QDMULL_N>; +def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>; +def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>; +def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>; +def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>; +def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>; let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in { -def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "dddgi", "siQsQi", OP_QRDMLAH_LN>; -def VQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "dddgi", "siQsQi", OP_QRDMLSH_LN>; +def VQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "...qI", "siQsQi", OP_QRDMLAH_LN>; +def VQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "...qI", "siQsQi", OP_QRDMLSH_LN>; } -def VMLA_N : IOpInst<"vmla_n", "ddds", "siUsUifQsQiQUsQUiQf", OP_MLA_N>; -def VMLAL_N : SOpInst<"vmlal_n", "wwds", "siUsUi", OP_MLAL_N>; -def VQDMLAL_N : SOpInst<"vqdmlal_n", "wwds", "si", OP_QDMLAL_N>; -def VMLS_N : IOpInst<"vmls_n", "ddds", "siUsUifQsQiQUsQUiQf", OP_MLS_N>; -def VMLSL_N : SOpInst<"vmlsl_n", "wwds", "siUsUi", OP_MLSL_N>; -def VQDMLSL_N : SOpInst<"vqdmlsl_n", "wwds", "si", OP_QDMLSL_N>; +def VMLA_N : IOpInst<"vmla_n", "...1", "siUsUifQsQiQUsQUiQf", OP_MLA_N>; +def VMLAL_N : SOpInst<"vmlal_n", "(>Q)(>Q).1", "siUsUi", OP_MLAL_N>; +def VQDMLAL_N : SOpInst<"vqdmlal_n", "(>Q)(>Q).1", "si", OP_QDMLAL_N>; +def VMLS_N : IOpInst<"vmls_n", "...1", "siUsUifQsQiQUsQUiQf", OP_MLS_N>; +def VMLSL_N : SOpInst<"vmlsl_n", "(>Q)(>Q).1", "siUsUi", OP_MLSL_N>; +def VQDMLSL_N : SOpInst<"vqdmlsl_n", "(>Q)(>Q).1", "si", OP_QDMLSL_N>; //////////////////////////////////////////////////////////////////////////////// // E.3.26 Vector Extract -def VEXT : WInst<"vext", "dddi", +def VEXT : WInst<"vext", "...I", "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQf">; //////////////////////////////////////////////////////////////////////////////// // E.3.27 Reverse vector elements -def VREV64 : WOpInst<"vrev64", "dd", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQf", +def VREV64 : WOpInst<"vrev64", "..", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQf", OP_REV64>; -def VREV32 : WOpInst<"vrev32", "dd", "csUcUsPcPsQcQsQUcQUsQPcQPs", OP_REV32>; -def VREV16 : WOpInst<"vrev16", "dd", "cUcPcQcQUcQPc", OP_REV16>; +def VREV32 : WOpInst<"vrev32", "..", "csUcUsPcPsQcQsQUcQUsQPcQPs", OP_REV32>; +def VREV16 : WOpInst<"vrev16", "..", "cUcPcQcQUcQPc", OP_REV16>; 
//////////////////////////////////////////////////////////////////////////////// // E.3.28 Other single operand arithmetic -def VABS : SInst<"vabs", "dd", "csifQcQsQiQf">; -def VQABS : SInst<"vqabs", "dd", "csiQcQsQi">; -def VNEG : SOpInst<"vneg", "dd", "csifQcQsQiQf", OP_NEG>; -def VQNEG : SInst<"vqneg", "dd", "csiQcQsQi">; -def VCLS : SInst<"vcls", "dd", "csiQcQsQi">; -def VCLZ : IInst<"vclz", "dd", "csiUcUsUiQcQsQiQUcQUsQUi">; -def VCNT : WInst<"vcnt", "dd", "UccPcQUcQcQPc">; -def VRECPE : SInst<"vrecpe", "dd", "fUiQfQUi">; -def VRSQRTE : SInst<"vrsqrte", "dd", "fUiQfQUi">; +def VABS : SInst<"vabs", "..", "csifQcQsQiQf">; +def VQABS : SInst<"vqabs", "..", "csiQcQsQi">; +def VNEG : SOpInst<"vneg", "..", "csifQcQsQiQf", OP_NEG>; +def VQNEG : SInst<"vqneg", "..", "csiQcQsQi">; +def VCLS : SInst<"vcls", "..", "csiQcQsQi">; +def VCLZ : IInst<"vclz", "..", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VCNT : WInst<"vcnt", "..", "UccPcQUcQcQPc">; +def VRECPE : SInst<"vrecpe", "..", "fUiQfQUi">; +def VRSQRTE : SInst<"vrsqrte", "..", "fUiQfQUi">; //////////////////////////////////////////////////////////////////////////////// // E.3.29 Logical operations -def VMVN : LOpInst<"vmvn", "dd", "csiUcUsUiPcQcQsQiQUcQUsQUiQPc", OP_NOT>; -def VAND : LOpInst<"vand", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_AND>; -def VORR : LOpInst<"vorr", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_OR>; -def VEOR : LOpInst<"veor", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_XOR>; -def VBIC : LOpInst<"vbic", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ANDN>; -def VORN : LOpInst<"vorn", "ddd", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ORN>; +def VMVN : LOpInst<"vmvn", "..", "csiUcUsUiPcQcQsQiQUcQUsQUiQPc", OP_NOT>; +def VAND : LOpInst<"vand", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_AND>; +def VORR : LOpInst<"vorr", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_OR>; +def VEOR : LOpInst<"veor", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_XOR>; +def VBIC : LOpInst<"vbic", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ANDN>; +def VORN : LOpInst<"vorn", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ORN>; let isHiddenLInst = 1 in -def VBSL : SInst<"vbsl", "dudd", +def VBSL : SInst<"vbsl", ".U..", "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPs">; //////////////////////////////////////////////////////////////////////////////// // E.3.30 Transposition operations -def VTRN : WInst<"vtrn", "2dd", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; -def VZIP : WInst<"vzip", "2dd", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; -def VUZP : WInst<"vuzp", "2dd", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; +def VTRN : WInst<"vtrn", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; +def VZIP : WInst<"vzip", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; +def VUZP : WInst<"vuzp", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; //////////////////////////////////////////////////////////////////////////////// // E.3.31 Vector reinterpret cast operations def VREINTERPRET - : NoTestOpInst<"vreinterpret", "dd", + : NoTestOpInst<"vreinterpret", "..", "csilUcUsUiUlhfPcPsQcQsQiQlQUcQUsQUiQUlQhQfQPcQPs", OP_REINT> { let CartesianProductOfTypes = 1; let ArchGuard = "!defined(__aarch64__)"; @@ -599,17 +600,17 @@ def VREINTERPRET // Vector fused multiply-add operations let ArchGuard = "defined(__ARM_FEATURE_FMA)" in { - def VFMA : SInst<"vfma", "dddd", "fQf">; - def VFMS : SOpInst<"vfms", "dddd", "fQf", OP_FMLS>; - def FMLA_N_F32 : SOpInst<"vfma_n", "ddds", "fQf", OP_FMLA_N>; + def VFMA : SInst<"vfma", "....", "fQf">; + def VFMS : SOpInst<"vfms", "....", "fQf", 
+  def FMLA_N_F32 : SOpInst<"vfma_n", "...1", "fQf", OP_FMLA_N>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // fp16 vector operations
-def SCALAR_HALF_GET_LANE : IOpInst<"vget_lane", "sdi", "h", OP_SCALAR_HALF_GET_LN>;
-def SCALAR_HALF_SET_LANE : IOpInst<"vset_lane", "dsdi", "h", OP_SCALAR_HALF_SET_LN>;
-def SCALAR_HALF_GET_LANEQ : IOpInst<"vget_lane", "sdi", "Qh", OP_SCALAR_HALF_GET_LNQ>;
-def SCALAR_HALF_SET_LANEQ : IOpInst<"vset_lane", "dsdi", "Qh", OP_SCALAR_HALF_SET_LNQ>;
+def SCALAR_HALF_GET_LANE : IOpInst<"vget_lane", "1.I", "h", OP_SCALAR_HALF_GET_LN>;
+def SCALAR_HALF_SET_LANE : IOpInst<"vset_lane", ".1.I", "h", OP_SCALAR_HALF_SET_LN>;
+def SCALAR_HALF_GET_LANEQ : IOpInst<"vget_lane", "1.I", "Qh", OP_SCALAR_HALF_GET_LNQ>;
+def SCALAR_HALF_SET_LANEQ : IOpInst<"vset_lane", ".1.I", "Qh", OP_SCALAR_HALF_SET_LNQ>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // AArch64 Intrinsics
@@ -618,474 +619,474 @@ let ArchGuard = "defined(__aarch64__)" in {
 
 ////////////////////////////////////////////////////////////////////////////////
 // Load/Store
-def LD1 : WInst<"vld1", "dc", "dQdPlQPl">;
-def LD2 : WInst<"vld2", "2c", "QUlQldQdPlQPl">;
-def LD3 : WInst<"vld3", "3c", "QUlQldQdPlQPl">;
-def LD4 : WInst<"vld4", "4c", "QUlQldQdPlQPl">;
-def ST1 : WInst<"vst1", "vpd", "dQdPlQPl">;
-def ST2 : WInst<"vst2", "vp2", "QUlQldQdPlQPl">;
-def ST3 : WInst<"vst3", "vp3", "QUlQldQdPlQPl">;
-def ST4 : WInst<"vst4", "vp4", "QUlQldQdPlQPl">;
-
-def LD1_X2 : WInst<"vld1_x2", "2c",
+def LD1 : WInst<"vld1", ".(c*!)", "dQdPlQPl">;
+def LD2 : WInst<"vld2", "2(c*!)", "QUlQldQdPlQPl">;
+def LD3 : WInst<"vld3", "3(c*!)", "QUlQldQdPlQPl">;
+def LD4 : WInst<"vld4", "4(c*!)", "QUlQldQdPlQPl">;
+def ST1 : WInst<"vst1", "v*(.!)", "dQdPlQPl">;
+def ST2 : WInst<"vst2", "v*(2!)", "QUlQldQdPlQPl">;
+def ST3 : WInst<"vst3", "v*(3!)", "QUlQldQdPlQPl">;
+def ST4 : WInst<"vst4", "v*(4!)", "QUlQldQdPlQPl">;
+
+def LD1_X2 : WInst<"vld1_x2", "2(c*!)",
              "dQdPlQPl">;
-def LD1_X3 : WInst<"vld1_x3", "3c",
+def LD1_X3 : WInst<"vld1_x3", "3(c*!)",
              "dQdPlQPl">;
-def LD1_X4 : WInst<"vld1_x4", "4c",
+def LD1_X4 : WInst<"vld1_x4", "4(c*!)",
             "dQdPlQPl">;
-def ST1_X2 : WInst<"vst1_x2", "vp2", "dQdPlQPl">;
-def ST1_X3 : WInst<"vst1_x3", "vp3", "dQdPlQPl">;
-def ST1_X4 : WInst<"vst1_x4", "vp4", "dQdPlQPl">;
+def ST1_X2 : WInst<"vst1_x2", "v*(2!)", "dQdPlQPl">;
+def ST1_X3 : WInst<"vst1_x3", "v*(3!)", "dQdPlQPl">;
+def ST1_X4 : WInst<"vst1_x4", "v*(4!)", "dQdPlQPl">;
 
-def LD1_LANE : WInst<"vld1_lane", "dcdi", "dQdPlQPl">;
-def LD2_LANE : WInst<"vld2_lane", "2c2i", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def LD3_LANE : WInst<"vld3_lane", "3c3i", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def LD4_LANE : WInst<"vld4_lane", "4c4i", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def ST1_LANE : WInst<"vst1_lane", "vpdi", "dQdPlQPl">;
-def ST2_LANE : WInst<"vst2_lane", "vp2i", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def ST3_LANE : WInst<"vst3_lane", "vp3i", "lUlQcQUcQPcQlQUldQdPlQPl">;
-def ST4_LANE : WInst<"vst4_lane", "vp4i", "lUlQcQUcQPcQlQUldQdPlQPl">;
+def LD1_LANE : WInst<"vld1_lane", ".(c*!).I", "dQdPlQPl">;
+def LD2_LANE : WInst<"vld2_lane", "2(c*!)2I", "lUlQcQUcQPcQlQUldQdPlQPl">;
+def LD3_LANE : WInst<"vld3_lane", "3(c*!)3I", "lUlQcQUcQPcQlQUldQdPlQPl">;
+def LD4_LANE : WInst<"vld4_lane", "4(c*!)4I", "lUlQcQUcQPcQlQUldQdPlQPl">;
+def ST1_LANE : WInst<"vst1_lane", "v*(.!)I", "dQdPlQPl">;
+def ST2_LANE : WInst<"vst2_lane", "v*(2!)I", "lUlQcQUcQPcQlQUldQdPlQPl">;
+def ST3_LANE : WInst<"vst3_lane", "v*(3!)I",
+               "lUlQcQUcQPcQlQUldQdPlQPl">;
+def ST4_LANE : WInst<"vst4_lane", "v*(4!)I", "lUlQcQUcQPcQlQUldQdPlQPl">;
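For loads and stores, `*` makes the argument a pointer to the element type, `c` qualifies it const, `v` is a void result, `2`/`3`/`4` select the multi-vector structs, and `!` marks the key type that resolves the overloaded builtin. Two signatures the defs above should keep producing (ordinary ACLE prototypes, shown here only as a sketch):

float64x2_t vld1q_f64(float64_t const *ptr);        /* LD1: ".(c*!)" on Qd */
void vst2q_f64(float64_t *ptr, float64x2x2_t val);  /* ST2: "v*(2!)" on Qd */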
"lUlQcQUcQPcQlQUldQdPlQPl">; +def ST4_LANE : WInst<"vst4_lane", "v*(4!)I", "lUlQcQUcQPcQlQUldQdPlQPl">; -def LD1_DUP : WInst<"vld1_dup", "dc", "dQdPlQPl">; -def LD2_DUP : WInst<"vld2_dup", "2c", "dQdPlQPl">; -def LD3_DUP : WInst<"vld3_dup", "3c", "dQdPlQPl">; -def LD4_DUP : WInst<"vld4_dup", "4c", "dQdPlQPl">; +def LD1_DUP : WInst<"vld1_dup", ".(c*!)", "dQdPlQPl">; +def LD2_DUP : WInst<"vld2_dup", "2(c*!)", "dQdPlQPl">; +def LD3_DUP : WInst<"vld3_dup", "3(c*!)", "dQdPlQPl">; +def LD4_DUP : WInst<"vld4_dup", "4(c*!)", "dQdPlQPl">; -def VLDRQ : WInst<"vldrq", "sc", "Pk">; -def VSTRQ : WInst<"vstrq", "vps", "Pk">; +def VLDRQ : WInst<"vldrq", "1(c*!)", "Pk">; +def VSTRQ : WInst<"vstrq", "v*(1!)", "Pk">; //////////////////////////////////////////////////////////////////////////////// // Addition -def ADD : IOpInst<"vadd", "ddd", "dQd", OP_ADD>; +def ADD : IOpInst<"vadd", "...", "dQd", OP_ADD>; //////////////////////////////////////////////////////////////////////////////// // Subtraction -def SUB : IOpInst<"vsub", "ddd", "dQd", OP_SUB>; +def SUB : IOpInst<"vsub", "...", "dQd", OP_SUB>; //////////////////////////////////////////////////////////////////////////////// // Multiplication -def MUL : IOpInst<"vmul", "ddd", "dQd", OP_MUL>; -def MLA : IOpInst<"vmla", "dddd", "dQd", OP_MLA>; -def MLS : IOpInst<"vmls", "dddd", "dQd", OP_MLS>; +def MUL : IOpInst<"vmul", "...", "dQd", OP_MUL>; +def MLA : IOpInst<"vmla", "....", "dQd", OP_MLA>; +def MLS : IOpInst<"vmls", "....", "dQd", OP_MLS>; //////////////////////////////////////////////////////////////////////////////// // Multiplication Extended -def MULX : SInst<"vmulx", "ddd", "fdQfQd">; +def MULX : SInst<"vmulx", "...", "fdQfQd">; //////////////////////////////////////////////////////////////////////////////// // Division -def FDIV : IOpInst<"vdiv", "ddd", "fdQfQd", OP_DIV>; +def FDIV : IOpInst<"vdiv", "...", "fdQfQd", OP_DIV>; //////////////////////////////////////////////////////////////////////////////// // Vector fused multiply-add operations -def FMLA : SInst<"vfma", "dddd", "dQd">; -def FMLS : SOpInst<"vfms", "dddd", "dQd", OP_FMLS>; +def FMLA : SInst<"vfma", "....", "dQd">; +def FMLS : SOpInst<"vfms", "....", "dQd", OP_FMLS>; //////////////////////////////////////////////////////////////////////////////// // MUL, MLA, MLS, FMA, FMS definitions with scalar argument -def VMUL_N_A64 : IOpInst<"vmul_n", "dds", "Qd", OP_MUL_N>; +def VMUL_N_A64 : IOpInst<"vmul_n", "..1", "Qd", OP_MUL_N>; -def FMLA_N : SOpInst<"vfma_n", "ddds", "dQd", OP_FMLA_N>; -def FMLS_N : SOpInst<"vfms_n", "ddds", "fdQfQd", OP_FMLS_N>; +def FMLA_N : SOpInst<"vfma_n", "...1", "dQd", OP_FMLA_N>; +def FMLS_N : SOpInst<"vfms_n", "...1", "fdQfQd", OP_FMLS_N>; -def MLA_N : SOpInst<"vmla_n", "ddds", "Qd", OP_MLA_N>; -def MLS_N : SOpInst<"vmls_n", "ddds", "Qd", OP_MLS_N>; +def MLA_N : SOpInst<"vmla_n", "...1", "Qd", OP_MLA_N>; +def MLS_N : SOpInst<"vmls_n", "...1", "Qd", OP_MLS_N>; //////////////////////////////////////////////////////////////////////////////// // Logical operations -def BSL : SInst<"vbsl", "dudd", "dPlQdQPl">; +def BSL : SInst<"vbsl", ".U..", "dPlQdQPl">; //////////////////////////////////////////////////////////////////////////////// // Absolute Difference -def ABD : SInst<"vabd", "ddd", "dQd">; +def ABD : SInst<"vabd", "...", "dQd">; //////////////////////////////////////////////////////////////////////////////// // saturating absolute/negate -def ABS : SInst<"vabs", "dd", "dQdlQl">; -def QABS : SInst<"vqabs", "dd", "lQl">; -def NEG : SOpInst<"vneg", "dd", "dlQdQl", 
 
 ////////////////////////////////////////////////////////////////////////////////
 // Signed Saturating Accumulated of Unsigned Value
-def SUQADD : SInst<"vuqadd", "ddu", "csilQcQsQiQl">;
+def SUQADD : SInst<"vuqadd", "..U", "csilQcQsQiQl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Unsigned Saturating Accumulated of Signed Value
-def USQADD : SInst<"vsqadd", "ddx", "UcUsUiUlQUcQUsQUiQUl">;
+def USQADD : SInst<"vsqadd", "..S", "UcUsUiUlQUcQUsQUiQUl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Reciprocal/Sqrt
-def FRECPS : IInst<"vrecps", "ddd", "dQd">;
-def FRSQRTS : IInst<"vrsqrts", "ddd", "dQd">;
-def FRECPE : SInst<"vrecpe", "dd", "dQd">;
-def FRSQRTE : SInst<"vrsqrte", "dd", "dQd">;
-def FSQRT : SInst<"vsqrt", "dd", "fdQfQd">;
+def FRECPS : IInst<"vrecps", "...", "dQd">;
+def FRSQRTS : IInst<"vrsqrts", "...", "dQd">;
+def FRECPE : SInst<"vrecpe", "..", "dQd">;
+def FRSQRTE : SInst<"vrsqrte", "..", "dQd">;
+def FSQRT : SInst<"vsqrt", "..", "fdQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // bitwise reverse
-def RBIT : IInst<"vrbit", "dd", "cUcPcQcQUcQPc">;
+def RBIT : IInst<"vrbit", "..", "cUcPcQcQUcQPc">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Integer extract and narrow to high
-def XTN2 : SOpInst<"vmovn_high", "qhk", "silUsUiUl", OP_XTN>;
+def XTN2 : SOpInst<"vmovn_high", "(<Q)<Q", "silUsUiUl", OP_XTN>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Signed integer saturating extract and unsigned narrow to high
-def SQXTUN2 : SOpInst<"vqmovun_high", "emd", "HsHiHl", OP_SQXTUN>;
+def SQXTUN2 : SOpInst<"vqmovun_high", "(<U)(<q).", "HsHiHl", OP_SQXTUN>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Integer saturating extract and narrow to high
-def QXTN2 : SOpInst<"vqmovn_high", "qhk", "silUsUiUl", OP_QXTN>;
+def QXTN2 : SOpInst<"vqmovn_high", "(<Q)<Q", "silUsUiUl", OP_QXTN>;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Converting vectors
-def VCVT_F32_F64 : SInst<"vcvt_f32_f64", "md", "Qd">;
-def VCVT_F64_F32 : SInst<"vcvt_f64_f32", "wd", "f">;
+def VCVT_F32_F64 : SInst<"vcvt_f32_f64", "(<q).", "Qd">;
+def VCVT_F64_F32 : SInst<"vcvt_f64_f32", "(>Q).", "f">;
 
-def VCVT_S64 : SInst<"vcvt_s64", "xd", "dQd">;
-def VCVT_U64 : SInst<"vcvt_u64", "ud", "dQd">;
-def VCVT_F64 : SInst<"vcvt_f64", "Fd", "lUlQlQUl">;
+def VCVT_S64 : SInst<"vcvt_s64", "S.", "dQd">;
+def VCVT_U64 : SInst<"vcvt_u64", "U.", "dQd">;
+def VCVT_F64 : SInst<"vcvt_f64", "F(.!)", "lUlQlQUl">;
 
-def VCVT_HIGH_F16_F32 : SOpInst<"vcvt_high_f16", "hmj", "Hf", OP_VCVT_NA_HI_F16>;
-def VCVT_HIGH_F32_F16 : SOpInst<"vcvt_high_f32", "wk", "h", OP_VCVT_EX_HI_F32>;
-def VCVT_HIGH_F32_F64 : SOpInst<"vcvt_high_f32", "qfj", "d", OP_VCVT_NA_HI_F32>;
-def VCVT_HIGH_F64_F32 : SOpInst<"vcvt_high_f64", "wj", "f", OP_VCVT_EX_HI_F64>;
+def VCVT_HIGH_F16_F32 : SOpInst<"vcvt_high_f16", "<(<q)Q", "Hf", OP_VCVT_NA_HI_F16>;
+def VCVT_HIGH_F32_F16 : SOpInst<"vcvt_high_f32", "(>Q)(Q!)", "h", OP_VCVT_EX_HI_F32>;
+def VCVT_HIGH_F32_F64 : SOpInst<"vcvt_high_f32", "(<Q)<Q", "d", OP_VCVT_NA_HI_F32>;
+def VCVT_HIGH_F64_F32 : SOpInst<"vcvt_high_f64", "(>Q)(Q!)", "f", OP_VCVT_EX_HI_F64>;
 
-def VCVTX_F32_F64 : SInst<"vcvtx_f32", "fj", "d">;
-def VCVTX_HIGH_F32_F64 : SOpInst<"vcvtx_high_f32", "qfj", "d", OP_VCVTX_HI>;
+def VCVTX_F32_F64 : SInst<"vcvtx_f32", "(F<)(Q!)", "d">;
+def VCVTX_HIGH_F32_F64 : SOpInst<"vcvtx_high_f32", "(<Q)<Q", "d", OP_VCVTX_HI>;
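The `_high` narrowing defs combine `<` (halve the element width, vector width unchanged) with `q`/`Q` overrides, so `(<Q)<Q` is: narrowed 128-bit result, narrowed 64-bit low half already computed, full-width 128-bit source. Assuming the prototypes reconstructed above, the generated signatures should remain, for example (sketch of standard ACLE declarations):

int8x16_t vmovn_high_s16(int8x8_t r, int16x8_t a);           /* "(<Q)<Q"  */
float16x8_t vcvt_high_f16_f32(float16x4_t r, float32x4_t a); /* "<(<q)Q"  */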
 
 ////////////////////////////////////////////////////////////////////////////////
 // Comparison
-def FCAGE : IInst<"vcage", "udd", "dQd">;
-def FCAGT : IInst<"vcagt", "udd", "dQd">;
-def FCALE : IInst<"vcale", "udd", "dQd">;
-def FCALT : IInst<"vcalt", "udd", "dQd">;
-def CMTST : WInst<"vtst", "udd", "lUlPlQlQUlQPl">;
-def CFMEQ : SOpInst<"vceq", "udd", "lUldQdQlQUlPlQPl", OP_EQ>;
-def CFMGE : SOpInst<"vcge", "udd", "lUldQdQlQUl", OP_GE>;
-def CFMLE : SOpInst<"vcle", "udd", "lUldQdQlQUl", OP_LE>;
-def CFMGT : SOpInst<"vcgt", "udd", "lUldQdQlQUl", OP_GT>;
-def CFMLT : SOpInst<"vclt", "udd", "lUldQdQlQUl", OP_LT>;
-
-def CMEQ : SInst<"vceqz", "ud",
+def FCAGE : IInst<"vcage", "U..", "dQd">;
+def FCAGT : IInst<"vcagt", "U..", "dQd">;
+def FCALE : IInst<"vcale", "U..", "dQd">;
+def FCALT : IInst<"vcalt", "U..", "dQd">;
+def CMTST : WInst<"vtst", "U..", "lUlPlQlQUlQPl">;
+def CFMEQ : SOpInst<"vceq", "U..", "lUldQdQlQUlPlQPl", OP_EQ>;
+def CFMGE : SOpInst<"vcge", "U..", "lUldQdQlQUl", OP_GE>;
+def CFMLE : SOpInst<"vcle", "U..", "lUldQdQlQUl", OP_LE>;
+def CFMGT : SOpInst<"vcgt", "U..", "lUldQdQlQUl", OP_GT>;
+def CFMLT : SOpInst<"vclt", "U..", "lUldQdQlQUl", OP_LT>;
+
+def CMEQ : SInst<"vceqz", "U.",
           "csilfUcUsUiUlPcPsPlQcQsQiQlQfQUcQUsQUiQUlQPcQPsdQdQPl">;
-def CMGE : SInst<"vcgez", "ud", "csilfdQcQsQiQlQfQd">;
-def CMLE : SInst<"vclez", "ud", "csilfdQcQsQiQlQfQd">;
-def CMGT : SInst<"vcgtz", "ud", "csilfdQcQsQiQlQfQd">;
-def CMLT : SInst<"vcltz", "ud", "csilfdQcQsQiQlQfQd">;
+def CMGE : SInst<"vcgez", "U.", "csilfdQcQsQiQlQfQd">;
+def CMLE : SInst<"vclez", "U.", "csilfdQcQsQiQlQfQd">;
+def CMGT : SInst<"vcgtz", "U.", "csilfdQcQsQiQlQfQd">;
+def CMLT : SInst<"vcltz", "U.", "csilfdQcQsQiQlQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Max/Min Integer
-def MAX : SInst<"vmax", "ddd", "dQd">;
-def MIN : SInst<"vmin", "ddd", "dQd">;
+def MAX : SInst<"vmax", "...", "dQd">;
+def MIN : SInst<"vmin", "...", "dQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Pairwise Max/Min
-def MAXP : SInst<"vpmax", "ddd", "QcQsQiQUcQUsQUiQfQd">;
-def MINP : SInst<"vpmin", "ddd", "QcQsQiQUcQUsQUiQfQd">;
+def MAXP : SInst<"vpmax", "...", "QcQsQiQUcQUsQUiQfQd">;
+def MINP : SInst<"vpmin", "...", "QcQsQiQUcQUsQUiQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Pairwise MaxNum/MinNum Floating Point
-def FMAXNMP : SInst<"vpmaxnm", "ddd", "fQfQd">;
-def FMINNMP : SInst<"vpminnm", "ddd", "fQfQd">;
+def FMAXNMP : SInst<"vpmaxnm", "...", "fQfQd">;
+def FMINNMP : SInst<"vpminnm", "...", "fQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Pairwise Addition
-def ADDP : IInst<"vpadd", "ddd", "QcQsQiQlQUcQUsQUiQUlQfQd">;
+def ADDP : IInst<"vpadd", "...", "QcQsQiQlQUcQUsQUiQUlQfQd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Shifts by constant
 let isShift = 1 in {
 // Left shift long high
-def SHLL_HIGH_N : SOpInst<"vshll_high_n", "ndi", "HcHsHiHUcHUsHUi",
+def SHLL_HIGH_N : SOpInst<"vshll_high_n", ">.I", "HcHsHiHUcHUsHUi",
                   OP_LONG_HI>;
 
 ////////////////////////////////////////////////////////////////////////////////
-def SRI_N : WInst<"vsri_n", "dddi", "PlQPl">;
-def SLI_N : WInst<"vsli_n", "dddi", "PlQPl">;
+def SRI_N : WInst<"vsri_n", "...I", "PlQPl">;
+def SLI_N : WInst<"vsli_n", "...I", "PlQPl">;
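`U..` on the comparison defs above keeps the long-standing rule that comparisons return an unsigned mask of the operand shape, for instance (sketch):

uint64x2_t vceqq_f64(float64x2_t a, float64x2_t b);  /* "U.." on Qd */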
 
 // Right shift narrow high
-def SHRN_HIGH_N : IOpInst<"vshrn_high_n", "hmdi",
+def SHRN_HIGH_N : IOpInst<"vshrn_high_n", "<(<q).I",
                   "HsHiHlHUsHUiHUl", OP_NARROW_HI>;
-def QSHRUN_HIGH_N : SOpInst<"vqshrun_high_n", "hmdi",
+def QSHRUN_HIGH_N : SOpInst<"vqshrun_high_n", "<(<q).I",
                   "HsHiHl", OP_NARROW_HI>;
-def RSHRN_HIGH_N : IOpInst<"vrshrn_high_n", "hmdi",
+def RSHRN_HIGH_N : IOpInst<"vrshrn_high_n", "<(<q).I",
                   "HsHiHlHUsHUiHUl", OP_NARROW_HI>;
-def QRSHRUN_HIGH_N : SOpInst<"vqrshrun_high_n", "hmdi",
+def QRSHRUN_HIGH_N : SOpInst<"vqrshrun_high_n", "<(<q).I",
                   "HsHiHl", OP_NARROW_HI>;
-def QSHRN_HIGH_N : SOpInst<"vqshrn_high_n", "hmdi",
+def QSHRN_HIGH_N : SOpInst<"vqshrn_high_n", "<(<q).I",
                   "HsHiHlHUsHUiHUl", OP_NARROW_HI>;
-def QRSHRN_HIGH_N : SOpInst<"vqrshrn_high_n", "hmdi",
+def QRSHRN_HIGH_N : SOpInst<"vqrshrn_high_n", "<(<q).I",
                   "HsHiHlHUsHUiHUl", OP_NARROW_HI>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Converting vectors
-def VMOVL_HIGH : SOpInst<"vmovl_high", "nd", "HcHsHiHUcHUsHUi", OP_MOVL_HI>;
+def VMOVL_HIGH : SOpInst<"vmovl_high", ">.", "HcHsHiHUcHUsHUi", OP_MOVL_HI>;
 
 let isVCVT_N = 1 in {
-def CVTF_N_F64 : SInst<"vcvt_n_f64", "Fdi", "lUlQlQUl">;
-def FCVTZS_N_S64 : SInst<"vcvt_n_s64", "xdi", "dQd">;
-def FCVTZS_N_U64 : SInst<"vcvt_n_u64", "udi", "dQd">;
+def CVTF_N_F64 : SInst<"vcvt_n_f64", "F(.!)I", "lUlQlQUl">;
+def FCVTZS_N_S64 : SInst<"vcvt_n_s64", "S.I", "dQd">;
+def FCVTZS_N_U64 : SInst<"vcvt_n_u64", "U.I", "dQd">;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // 3VDiff class using high 64-bit in operands
-def VADDL_HIGH : SOpInst<"vaddl_high", "wkk", "csiUcUsUi", OP_ADDLHi>;
-def VADDW_HIGH : SOpInst<"vaddw_high", "wwk", "csiUcUsUi", OP_ADDWHi>;
-def VSUBL_HIGH : SOpInst<"vsubl_high", "wkk", "csiUcUsUi", OP_SUBLHi>;
-def VSUBW_HIGH : SOpInst<"vsubw_high", "wwk", "csiUcUsUi", OP_SUBWHi>;
+def VADDL_HIGH : SOpInst<"vaddl_high", "(>Q)QQ", "csiUcUsUi", OP_ADDLHi>;
+def VADDW_HIGH : SOpInst<"vaddw_high", "(>Q)(>Q)Q", "csiUcUsUi", OP_ADDWHi>;
+def VSUBL_HIGH : SOpInst<"vsubl_high", "(>Q)QQ", "csiUcUsUi", OP_SUBLHi>;
+def VSUBW_HIGH : SOpInst<"vsubw_high", "(>Q)(>Q)Q", "csiUcUsUi", OP_SUBWHi>;
 
-def VABDL_HIGH : SOpInst<"vabdl_high", "wkk", "csiUcUsUi", OP_ABDLHi>;
-def VABAL_HIGH : SOpInst<"vabal_high", "wwkk", "csiUcUsUi", OP_ABALHi>;
+def VABDL_HIGH : SOpInst<"vabdl_high", "(>Q)QQ", "csiUcUsUi", OP_ABDLHi>;
+def VABAL_HIGH : SOpInst<"vabal_high", "(>Q)(>Q)QQ", "csiUcUsUi", OP_ABALHi>;
 
-def VMULL_HIGH : SOpInst<"vmull_high", "wkk", "csiUcUsUiPc", OP_MULLHi>;
-def VMULL_HIGH_N : SOpInst<"vmull_high_n", "wks", "siUsUi", OP_MULLHi_N>;
-def VMLAL_HIGH : SOpInst<"vmlal_high", "wwkk", "csiUcUsUi", OP_MLALHi>;
-def VMLAL_HIGH_N : SOpInst<"vmlal_high_n", "wwks", "siUsUi", OP_MLALHi_N>;
-def VMLSL_HIGH : SOpInst<"vmlsl_high", "wwkk", "csiUcUsUi", OP_MLSLHi>;
-def VMLSL_HIGH_N : SOpInst<"vmlsl_high_n", "wwks", "siUsUi", OP_MLSLHi_N>;
+def VMULL_HIGH : SOpInst<"vmull_high", "(>Q)QQ", "csiUcUsUiPc", OP_MULLHi>;
+def VMULL_HIGH_N : SOpInst<"vmull_high_n", "(>Q)Q1", "siUsUi", OP_MULLHi_N>;
+def VMLAL_HIGH : SOpInst<"vmlal_high", "(>Q)(>Q)QQ", "csiUcUsUi", OP_MLALHi>;
+def VMLAL_HIGH_N : SOpInst<"vmlal_high_n", "(>Q)(>Q)Q1", "siUsUi", OP_MLALHi_N>;
+def VMLSL_HIGH : SOpInst<"vmlsl_high", "(>Q)(>Q)QQ", "csiUcUsUi", OP_MLSLHi>;
+def VMLSL_HIGH_N : SOpInst<"vmlsl_high_n", "(>Q)(>Q)Q1", "siUsUi", OP_MLSLHi_N>;
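The same grouping conventions cover the `_high` forms: `<(<q).I` narrows into the top half of a 128-bit result, while `(>Q)QQ` produces a doubled-width 128-bit result from the top halves of two 128-bit sources. A sketch of two signatures this section should keep generating:

int8x16_t vshrn_high_n_s16(int8x8_t r, int16x8_t a, const int n); /* "<(<q).I" */
int32x4_t vmull_high_s16(int16x8_t a, int16x8_t b);               /* "(>Q)QQ"  */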
SOpInst<"vaddhn_high", "(; +def VRADDHN_HIGH : SOpInst<"vraddhn_high", "(; +def VSUBHN_HIGH : SOpInst<"vsubhn_high", "(; +def VRSUBHN_HIGH : SOpInst<"vrsubhn_high", "(; -def VQDMULL_HIGH : SOpInst<"vqdmull_high", "wkk", "si", OP_QDMULLHi>; -def VQDMULL_HIGH_N : SOpInst<"vqdmull_high_n", "wks", "si", OP_QDMULLHi_N>; -def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "wwkk", "si", OP_QDMLALHi>; -def VQDMLAL_HIGH_N : SOpInst<"vqdmlal_high_n", "wwks", "si", OP_QDMLALHi_N>; -def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>; -def VQDMLSL_HIGH_N : SOpInst<"vqdmlsl_high_n", "wwks", "si", OP_QDMLSLHi_N>; -def VMULL_P64 : SInst<"vmull", "rss", "Pl">; -def VMULL_HIGH_P64 : SOpInst<"vmull_high", "rdd", "HPl", OP_MULLHi_P64>; +def VQDMULL_HIGH : SOpInst<"vqdmull_high", "(>Q)QQ", "si", OP_QDMULLHi>; +def VQDMULL_HIGH_N : SOpInst<"vqdmull_high_n", "(>Q)Q1", "si", OP_QDMULLHi_N>; +def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "(>Q)(>Q)QQ", "si", OP_QDMLALHi>; +def VQDMLAL_HIGH_N : SOpInst<"vqdmlal_high_n", "(>Q)(>Q)Q1", "si", OP_QDMLALHi_N>; +def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "(>Q)(>Q)QQ", "si", OP_QDMLSLHi>; +def VQDMLSL_HIGH_N : SOpInst<"vqdmlsl_high_n", "(>Q)(>Q)Q1", "si", OP_QDMLSLHi_N>; +def VMULL_P64 : SInst<"vmull", "(1>)11", "Pl">; +def VMULL_HIGH_P64 : SOpInst<"vmull_high", "(1>)..", "HPl", OP_MULLHi_P64>; //////////////////////////////////////////////////////////////////////////////// // Extract or insert element from vector -def GET_LANE : IInst<"vget_lane", "sdi", "dQdPlQPl">; -def SET_LANE : IInst<"vset_lane", "dsdi", "dQdPlQPl">; -def COPY_LANE : IOpInst<"vcopy_lane", "ddidi", +def GET_LANE : IInst<"vget_lane", "1.I", "dQdPlQPl">; +def SET_LANE : IInst<"vset_lane", ".1.I", "dQdPlQPl">; +def COPY_LANE : IOpInst<"vcopy_lane", "..I.I", "csilUcUsUiUlPcPsPlfd", OP_COPY_LN>; -def COPYQ_LANE : IOpInst<"vcopy_lane", "ddigi", +def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI", "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; -def COPY_LANEQ : IOpInst<"vcopy_laneq", "ddiki", +def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI", "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>; -def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "ddidi", +def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I", "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; //////////////////////////////////////////////////////////////////////////////// // Set all lanes to same value -def VDUP_LANE1: WOpInst<"vdup_lane", "dgi", "hdQhQdPlQPl", OP_DUP_LN>; -def VDUP_LANE2: WOpInst<"vdup_laneq", "dji", +def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "hdQhQdPlQPl", OP_DUP_LN>; +def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI", "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl", OP_DUP_LN>; -def DUP_N : WOpInst<"vdup_n", "ds", "dQdPlQPl", OP_DUP>; -def MOV_N : WOpInst<"vmov_n", "ds", "dQdPlQPl", OP_DUP>; +def DUP_N : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>; +def MOV_N : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>; //////////////////////////////////////////////////////////////////////////////// -def COMBINE : NoTestOpInst<"vcombine", "kdd", "dPl", OP_CONC>; +def COMBINE : NoTestOpInst<"vcombine", "Q..", "dPl", OP_CONC>; //////////////////////////////////////////////////////////////////////////////// //Initialize a vector from bit pattern -def CREATE : NoTestOpInst<"vcreate", "dl", "dPl", OP_CAST> { +def CREATE : NoTestOpInst<"vcreate", ".(IU>)", "dPl", OP_CAST> { let BigEndianSafe = 1; } //////////////////////////////////////////////////////////////////////////////// -def VMLA_LANEQ : IOpInst<"vmla_laneq", "dddji", +def VMLA_LANEQ : 
IOpInst<"vmla_laneq", "...QI", "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; -def VMLS_LANEQ : IOpInst<"vmls_laneq", "dddji", +def VMLS_LANEQ : IOpInst<"vmls_laneq", "...QI", "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; -def VFMA_LANE : IInst<"vfma_lane", "dddgi", "fdQfQd">; -def VFMA_LANEQ : IInst<"vfma_laneq", "dddji", "fdQfQd"> { +def VFMA_LANE : IInst<"vfma_lane", "...qI", "fdQfQd">; +def VFMA_LANEQ : IInst<"vfma_laneq", "...QI", "fdQfQd"> { let isLaneQ = 1; } -def VFMS_LANE : IOpInst<"vfms_lane", "dddgi", "fdQfQd", OP_FMS_LN>; -def VFMS_LANEQ : IOpInst<"vfms_laneq", "dddji", "fdQfQd", OP_FMS_LNQ>; +def VFMS_LANE : IOpInst<"vfms_lane", "...qI", "fdQfQd", OP_FMS_LN>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ>; -def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "wwdki", "siUsUi", OP_MLAL_LN>; -def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "wwkdi", "siUsUi", +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN>; +def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLALHi_LN>; -def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "wwkki", "siUsUi", +def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "(>Q)(>Q)QQI", "siUsUi", OP_MLALHi_LN>; -def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "wwdki", "siUsUi", OP_MLSL_LN>; -def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "wwkdi", "siUsUi", +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN>; +def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLSLHi_LN>; -def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "wwkki", "siUsUi", +def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "(>Q)(>Q)QQI", "siUsUi", OP_MLSLHi_LN>; -def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "wwdki", "si", OP_QDMLAL_LN>; -def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "wwkdi", "si", +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN>; +def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLALHi_LN>; -def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "wwkki", "si", +def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "(>Q)(>Q)QQI", "si", OP_QDMLALHi_LN>; -def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "wwdki", "si", OP_QDMLSL_LN>; -def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "wwkdi", "si", +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN>; +def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLSLHi_LN>; -def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "wwkki", "si", +def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "(>Q)(>Q)QQI", "si", OP_QDMLSLHi_LN>; // Newly add double parameter for vmul_lane in aarch64 // Note: d type is handled by SCALAR_VMUL_LANE -def VMUL_LANE_A64 : IOpInst<"vmul_lane", "ddgi", "Qd", OP_MUL_LN>; +def VMUL_LANE_A64 : IOpInst<"vmul_lane", "..qI", "Qd", OP_MUL_LN>; // Note: d type is handled by SCALAR_VMUL_LANEQ -def VMUL_LANEQ : IOpInst<"vmul_laneq", "ddji", +def VMUL_LANEQ : IOpInst<"vmul_laneq", "..QI", "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN>; -def VMULL_LANEQ : SOpInst<"vmull_laneq", "wdki", "siUsUi", OP_MULL_LN>; -def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "wkdi", "siUsUi", +def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN>; +def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "(>Q)Q.I", "siUsUi", OP_MULLHi_LN>; -def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "wkki", "siUsUi", +def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "(>Q)QQI", "siUsUi", OP_MULLHi_LN>; -def VQDMULL_LANEQ : 
SOpInst<"vqdmull_laneq", "wdki", "si", OP_QDMULL_LN>; -def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "wkdi", "si", +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN>; +def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si", OP_QDMULLHi_LN>; -def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "wkki", "si", +def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si", OP_QDMULLHi_LN>; -def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ddji", "siQsQi", OP_QDMULH_LN>; -def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ddji", "siQsQi", OP_QRDMULH_LN>; +def VQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "..QI", "siQsQi", OP_QDMULH_LN>; +def VQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "..QI", "siQsQi", OP_QRDMULH_LN>; let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { -def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "dddji", "siQsQi", OP_QRDMLAH_LN>; -def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "dddji", "siQsQi", OP_QRDMLSH_LN>; +def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>; +def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>; } // Note: d type implemented by SCALAR_VMULX_LANE -def VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "fQfQd", OP_MULX_LN>; +def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>; // Note: d type is implemented by SCALAR_VMULX_LANEQ -def VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "fQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN>; //////////////////////////////////////////////////////////////////////////////// // Across vectors class -def VADDLV : SInst<"vaddlv", "rd", "csiUcUsUiQcQsQiQUcQUsQUi">; -def VMAXV : SInst<"vmaxv", "sd", "csifUcUsUiQcQsQiQUcQUsQUiQfQd">; -def VMINV : SInst<"vminv", "sd", "csifUcUsUiQcQsQiQUcQUsQUiQfQd">; -def VADDV : SInst<"vaddv", "sd", "csifUcUsUiQcQsQiQUcQUsQUiQfQdQlQUl">; -def FMAXNMV : SInst<"vmaxnmv", "sd", "fQfQd">; -def FMINNMV : SInst<"vminnmv", "sd", "fQfQd">; +def VADDLV : SInst<"vaddlv", "(1>).", "csiUcUsUiQcQsQiQUcQUsQUi">; +def VMAXV : SInst<"vmaxv", "1.", "csifUcUsUiQcQsQiQUcQUsQUiQfQd">; +def VMINV : SInst<"vminv", "1.", "csifUcUsUiQcQsQiQUcQUsQUiQfQd">; +def VADDV : SInst<"vaddv", "1.", "csifUcUsUiQcQsQiQUcQUsQUiQfQdQlQUl">; +def FMAXNMV : SInst<"vmaxnmv", "1.", "fQfQd">; +def FMINNMV : SInst<"vminnmv", "1.", "fQfQd">; //////////////////////////////////////////////////////////////////////////////// // Newly added Vector Extract for f64 -def VEXT_A64 : WInst<"vext", "dddi", "dQdPlQPl">; +def VEXT_A64 : WInst<"vext", "...I", "dQdPlQPl">; //////////////////////////////////////////////////////////////////////////////// // Crypto let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)" in { -def AESE : SInst<"vaese", "ddd", "QUc">; -def AESD : SInst<"vaesd", "ddd", "QUc">; -def AESMC : SInst<"vaesmc", "dd", "QUc">; -def AESIMC : SInst<"vaesimc", "dd", "QUc">; - -def SHA1H : SInst<"vsha1h", "ss", "Ui">; -def SHA1SU1 : SInst<"vsha1su1", "ddd", "QUi">; -def SHA256SU0 : SInst<"vsha256su0", "ddd", "QUi">; - -def SHA1C : SInst<"vsha1c", "ddsd", "QUi">; -def SHA1P : SInst<"vsha1p", "ddsd", "QUi">; -def SHA1M : SInst<"vsha1m", "ddsd", "QUi">; -def SHA1SU0 : SInst<"vsha1su0", "dddd", "QUi">; -def SHA256H : SInst<"vsha256h", "dddd", "QUi">; -def SHA256H2 : SInst<"vsha256h2", "dddd", "QUi">; -def SHA256SU1 : SInst<"vsha256su1", "dddd", "QUi">; +def AESE : SInst<"vaese", "...", "QUc">; +def AESD : SInst<"vaesd", "...", "QUc">; +def AESMC : 
SInst<"vaesmc", "..", "QUc">; +def AESIMC : SInst<"vaesimc", "..", "QUc">; + +def SHA1H : SInst<"vsha1h", "11", "Ui">; +def SHA1SU1 : SInst<"vsha1su1", "...", "QUi">; +def SHA256SU0 : SInst<"vsha256su0", "...", "QUi">; + +def SHA1C : SInst<"vsha1c", "..1.", "QUi">; +def SHA1P : SInst<"vsha1p", "..1.", "QUi">; +def SHA1M : SInst<"vsha1m", "..1.", "QUi">; +def SHA1SU0 : SInst<"vsha1su0", "....", "QUi">; +def SHA256H : SInst<"vsha256h", "....", "QUi">; +def SHA256H2 : SInst<"vsha256h2", "....", "QUi">; +def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">; } //////////////////////////////////////////////////////////////////////////////// // Float -> Int conversions with explicit rounding mode let ArchGuard = "__ARM_ARCH >= 8" in { -def FCVTNS_S32 : SInst<"vcvtn_s32", "xd", "fQf">; -def FCVTNU_S32 : SInst<"vcvtn_u32", "ud", "fQf">; -def FCVTPS_S32 : SInst<"vcvtp_s32", "xd", "fQf">; -def FCVTPU_S32 : SInst<"vcvtp_u32", "ud", "fQf">; -def FCVTMS_S32 : SInst<"vcvtm_s32", "xd", "fQf">; -def FCVTMU_S32 : SInst<"vcvtm_u32", "ud", "fQf">; -def FCVTAS_S32 : SInst<"vcvta_s32", "xd", "fQf">; -def FCVTAU_S32 : SInst<"vcvta_u32", "ud", "fQf">; +def FCVTNS_S32 : SInst<"vcvtn_s32", "S.", "fQf">; +def FCVTNU_S32 : SInst<"vcvtn_u32", "U.", "fQf">; +def FCVTPS_S32 : SInst<"vcvtp_s32", "S.", "fQf">; +def FCVTPU_S32 : SInst<"vcvtp_u32", "U.", "fQf">; +def FCVTMS_S32 : SInst<"vcvtm_s32", "S.", "fQf">; +def FCVTMU_S32 : SInst<"vcvtm_u32", "U.", "fQf">; +def FCVTAS_S32 : SInst<"vcvta_s32", "S.", "fQf">; +def FCVTAU_S32 : SInst<"vcvta_u32", "U.", "fQf">; } let ArchGuard = "__ARM_ARCH >= 8 && defined(__aarch64__)" in { -def FCVTNS_S64 : SInst<"vcvtn_s64", "xd", "dQd">; -def FCVTNU_S64 : SInst<"vcvtn_u64", "ud", "dQd">; -def FCVTPS_S64 : SInst<"vcvtp_s64", "xd", "dQd">; -def FCVTPU_S64 : SInst<"vcvtp_u64", "ud", "dQd">; -def FCVTMS_S64 : SInst<"vcvtm_s64", "xd", "dQd">; -def FCVTMU_S64 : SInst<"vcvtm_u64", "ud", "dQd">; -def FCVTAS_S64 : SInst<"vcvta_s64", "xd", "dQd">; -def FCVTAU_S64 : SInst<"vcvta_u64", "ud", "dQd">; +def FCVTNS_S64 : SInst<"vcvtn_s64", "S.", "dQd">; +def FCVTNU_S64 : SInst<"vcvtn_u64", "U.", "dQd">; +def FCVTPS_S64 : SInst<"vcvtp_s64", "S.", "dQd">; +def FCVTPU_S64 : SInst<"vcvtp_u64", "U.", "dQd">; +def FCVTMS_S64 : SInst<"vcvtm_s64", "S.", "dQd">; +def FCVTMU_S64 : SInst<"vcvtm_u64", "U.", "dQd">; +def FCVTAS_S64 : SInst<"vcvta_s64", "S.", "dQd">; +def FCVTAU_S64 : SInst<"vcvta_u64", "U.", "dQd">; } //////////////////////////////////////////////////////////////////////////////// // Round to Integral let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { -def FRINTN_S32 : SInst<"vrndn", "dd", "fQf">; -def FRINTA_S32 : SInst<"vrnda", "dd", "fQf">; -def FRINTP_S32 : SInst<"vrndp", "dd", "fQf">; -def FRINTM_S32 : SInst<"vrndm", "dd", "fQf">; -def FRINTX_S32 : SInst<"vrndx", "dd", "fQf">; -def FRINTZ_S32 : SInst<"vrnd", "dd", "fQf">; -def FRINTI_S32 : SInst<"vrndi", "dd", "fQf">; +def FRINTN_S32 : SInst<"vrndn", "..", "fQf">; +def FRINTA_S32 : SInst<"vrnda", "..", "fQf">; +def FRINTP_S32 : SInst<"vrndp", "..", "fQf">; +def FRINTM_S32 : SInst<"vrndm", "..", "fQf">; +def FRINTX_S32 : SInst<"vrndx", "..", "fQf">; +def FRINTZ_S32 : SInst<"vrnd", "..", "fQf">; +def FRINTI_S32 : SInst<"vrndi", "..", "fQf">; } let ArchGuard = "__ARM_ARCH >= 8 && defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { -def FRINTN_S64 : SInst<"vrndn", "dd", "dQd">; -def FRINTA_S64 : SInst<"vrnda", "dd", "dQd">; -def FRINTP_S64 : SInst<"vrndp", "dd", "dQd">; -def FRINTM_S64 : 
SInst<"vrndm", "dd", "dQd">; -def FRINTX_S64 : SInst<"vrndx", "dd", "dQd">; -def FRINTZ_S64 : SInst<"vrnd", "dd", "dQd">; -def FRINTI_S64 : SInst<"vrndi", "dd", "dQd">; +def FRINTN_S64 : SInst<"vrndn", "..", "dQd">; +def FRINTA_S64 : SInst<"vrnda", "..", "dQd">; +def FRINTP_S64 : SInst<"vrndp", "..", "dQd">; +def FRINTM_S64 : SInst<"vrndm", "..", "dQd">; +def FRINTX_S64 : SInst<"vrndx", "..", "dQd">; +def FRINTZ_S64 : SInst<"vrnd", "..", "dQd">; +def FRINTI_S64 : SInst<"vrndi", "..", "dQd">; } //////////////////////////////////////////////////////////////////////////////// // MaxNum/MinNum Floating Point let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN)" in { -def FMAXNM_S32 : SInst<"vmaxnm", "ddd", "fQf">; -def FMINNM_S32 : SInst<"vminnm", "ddd", "fQf">; +def FMAXNM_S32 : SInst<"vmaxnm", "...", "fQf">; +def FMINNM_S32 : SInst<"vminnm", "...", "fQf">; } let ArchGuard = "__ARM_ARCH >= 8 && defined(__aarch64__) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)" in { -def FMAXNM_S64 : SInst<"vmaxnm", "ddd", "dQd">; -def FMINNM_S64 : SInst<"vminnm", "ddd", "dQd">; +def FMAXNM_S64 : SInst<"vmaxnm", "...", "dQd">; +def FMINNM_S64 : SInst<"vminnm", "...", "dQd">; } //////////////////////////////////////////////////////////////////////////////// // Permutation -def VTRN1 : SOpInst<"vtrn1", "ddd", +def VTRN1 : SOpInst<"vtrn1", "...", "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_TRN1>; -def VZIP1 : SOpInst<"vzip1", "ddd", +def VZIP1 : SOpInst<"vzip1", "...", "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_ZIP1>; -def VUZP1 : SOpInst<"vuzp1", "ddd", +def VUZP1 : SOpInst<"vuzp1", "...", "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_UZP1>; -def VTRN2 : SOpInst<"vtrn2", "ddd", +def VTRN2 : SOpInst<"vtrn2", "...", "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_TRN2>; -def VZIP2 : SOpInst<"vzip2", "ddd", +def VZIP2 : SOpInst<"vzip2", "...", "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_ZIP2>; -def VUZP2 : SOpInst<"vuzp2", "ddd", +def VUZP2 : SOpInst<"vuzp2", "...", "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_UZP2>; //////////////////////////////////////////////////////////////////////////////// // Table lookup let InstName = "vtbl" in { -def VQTBL1_A64 : WInst<"vqtbl1", "dju", "UccPcQUcQcQPc">; -def VQTBL2_A64 : WInst<"vqtbl2", "dBu", "UccPcQUcQcQPc">; -def VQTBL3_A64 : WInst<"vqtbl3", "dCu", "UccPcQUcQcQPc">; -def VQTBL4_A64 : WInst<"vqtbl4", "dDu", "UccPcQUcQcQPc">; +def VQTBL1_A64 : WInst<"vqtbl1", ".QU", "UccPcQUcQcQPc">; +def VQTBL2_A64 : WInst<"vqtbl2", ".(2Q)U", "UccPcQUcQcQPc">; +def VQTBL3_A64 : WInst<"vqtbl3", ".(3Q)U", "UccPcQUcQcQPc">; +def VQTBL4_A64 : WInst<"vqtbl4", ".(4Q)U", "UccPcQUcQcQPc">; } let InstName = "vtbx" in { -def VQTBX1_A64 : WInst<"vqtbx1", "ddju", "UccPcQUcQcQPc">; -def VQTBX2_A64 : WInst<"vqtbx2", "ddBu", "UccPcQUcQcQPc">; -def VQTBX3_A64 : WInst<"vqtbx3", "ddCu", "UccPcQUcQcQPc">; -def VQTBX4_A64 : WInst<"vqtbx4", "ddDu", "UccPcQUcQcQPc">; +def VQTBX1_A64 : WInst<"vqtbx1", "..QU", "UccPcQUcQcQPc">; +def VQTBX2_A64 : WInst<"vqtbx2", "..(2Q)U", "UccPcQUcQcQPc">; +def VQTBX3_A64 : WInst<"vqtbx3", "..(3Q)U", "UccPcQUcQcQPc">; +def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPc">; } //////////////////////////////////////////////////////////////////////////////// @@ -1095,7 +1096,7 @@ def VQTBX4_A64 : WInst<"vqtbx4", "ddDu", "UccPcQUcQcQPc">; // itself during generation so, unlike all other intrinsics, this one should // include *all* types, not just additional ones. 
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -1095,7 +1096,7 @@ def VQTBX4_A64 : WInst<"vqtbx4", "ddDu", "UccPcQUcQcQPc">;
 // itself during generation so, unlike all other intrinsics, this one should
 // include *all* types, not just additional ones.
 def VVREINTERPRET
-  : NoTestOpInst<"vreinterpret", "dd",
+  : NoTestOpInst<"vreinterpret", "..",
       "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", OP_REINT> {
   let CartesianProductOfTypes = 1;
   let BigEndianSafe = 1;
@@ -1107,332 +1108,332 @@ def VVREINTERPRET
 // Scalar Arithmetic
 
 // Scalar Addition
-def SCALAR_ADD : SInst<"vadd", "sss", "SlSUl">;
+def SCALAR_ADD : SInst<"vadd", "111", "SlSUl">;
 
 // Scalar Saturating Add
-def SCALAR_QADD : SInst<"vqadd", "sss", "ScSsSiSlSUcSUsSUiSUl">;
+def SCALAR_QADD : SInst<"vqadd", "111", "ScSsSiSlSUcSUsSUiSUl">;
 
 // Scalar Subtraction
-def SCALAR_SUB : SInst<"vsub", "sss", "SlSUl">;
+def SCALAR_SUB : SInst<"vsub", "111", "SlSUl">;
 
 // Scalar Saturating Sub
-def SCALAR_QSUB : SInst<"vqsub", "sss", "ScSsSiSlSUcSUsSUiSUl">;
+def SCALAR_QSUB : SInst<"vqsub", "111", "ScSsSiSlSUcSUsSUiSUl">;
 
 let InstName = "vmov" in {
-def VGET_HIGH_A64 : NoTestOpInst<"vget_high", "dk", "dPl", OP_HI>;
-def VGET_LOW_A64 : NoTestOpInst<"vget_low", "dk", "dPl", OP_LO>;
+def VGET_HIGH_A64 : NoTestOpInst<"vget_high", ".Q", "dPl", OP_HI>;
+def VGET_LOW_A64 : NoTestOpInst<"vget_low", ".Q", "dPl", OP_LO>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Shift
 // Scalar Shift Left
-def SCALAR_SHL: SInst<"vshl", "sss", "SlSUl">;
+def SCALAR_SHL: SInst<"vshl", "111", "SlSUl">;
 // Scalar Saturating Shift Left
-def SCALAR_QSHL: SInst<"vqshl", "sss", "ScSsSiSlSUcSUsSUiSUl">;
+def SCALAR_QSHL: SInst<"vqshl", "111", "ScSsSiSlSUcSUsSUiSUl">;
 // Scalar Saturating Rounding Shift Left
-def SCALAR_QRSHL: SInst<"vqrshl", "sss", "ScSsSiSlSUcSUsSUiSUl">;
+def SCALAR_QRSHL: SInst<"vqrshl", "111", "ScSsSiSlSUcSUsSUiSUl">;
 // Scalar Shift Rounding Left
-def SCALAR_RSHL: SInst<"vrshl", "sss", "SlSUl">;
+def SCALAR_RSHL: SInst<"vrshl", "111", "SlSUl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Shift (Immediate)
 let isScalarShift = 1 in {
 // Signed/Unsigned Shift Right (Immediate)
-def SCALAR_SSHR_N: SInst<"vshr_n", "ssi", "SlSUl">;
+def SCALAR_SSHR_N: SInst<"vshr_n", "11I", "SlSUl">;
 // Signed/Unsigned Rounding Shift Right (Immediate)
-def SCALAR_SRSHR_N: SInst<"vrshr_n", "ssi", "SlSUl">;
+def SCALAR_SRSHR_N: SInst<"vrshr_n", "11I", "SlSUl">;
 // Signed/Unsigned Shift Right and Accumulate (Immediate)
-def SCALAR_SSRA_N: SInst<"vsra_n", "sssi", "SlSUl">;
+def SCALAR_SSRA_N: SInst<"vsra_n", "111I", "SlSUl">;
 // Signed/Unsigned Rounding Shift Right and Accumulate (Immediate)
-def SCALAR_SRSRA_N: SInst<"vrsra_n", "sssi", "SlSUl">;
+def SCALAR_SRSRA_N: SInst<"vrsra_n", "111I", "SlSUl">;
 // Shift Left (Immediate)
-def SCALAR_SHL_N: SInst<"vshl_n", "ssi", "SlSUl">;
+def SCALAR_SHL_N: SInst<"vshl_n", "11I", "SlSUl">;
 // Signed/Unsigned Saturating Shift Left (Immediate)
-def SCALAR_SQSHL_N: SInst<"vqshl_n", "ssi", "ScSsSiSlSUcSUsSUiSUl">;
+def SCALAR_SQSHL_N: SInst<"vqshl_n", "11I", "ScSsSiSlSUcSUsSUiSUl">;
 // Signed Saturating Shift Left Unsigned (Immediate)
-def SCALAR_SQSHLU_N: SInst<"vqshlu_n", "ssi", "ScSsSiSl">;
+def SCALAR_SQSHLU_N: SInst<"vqshlu_n", "11I", "ScSsSiSl">;
 // Shift Right And Insert (Immediate)
-def SCALAR_SRI_N: SInst<"vsri_n", "sssi", "SlSUl">;
+def SCALAR_SRI_N: SInst<"vsri_n", "111I", "SlSUl">;
 // Shift Left And Insert (Immediate)
-def SCALAR_SLI_N: SInst<"vsli_n", "sssi", "SlSUl">;
+def SCALAR_SLI_N: SInst<"vsli_n", "111I", "SlSUl">;
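With the `S`-prefixed scalar type specs, `111` is simply three scalars of the same type and `11I` a scalar plus an immediate, so the scalar defs above should keep generating prototypes such as (sketch):

int64_t vqaddd_s64(int64_t a, int64_t b);     /* "111" on Sl */
int64_t vshrd_n_s64(int64_t a, const int n);  /* "11I" on Sl */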
"SsSiSlSUsSUiSUl">; + def SCALAR_SQSHRN_N: SInst<"vqshrn_n", "(1<)1I", "SsSiSlSUsSUiSUl">; // Signed/Unsigned Saturating Rounded Shift Right Narrow (Immediate) - def SCALAR_SQRSHRN_N: SInst<"vqrshrn_n", "zsi", "SsSiSlSUsSUiSUl">; + def SCALAR_SQRSHRN_N: SInst<"vqrshrn_n", "(1<)1I", "SsSiSlSUsSUiSUl">; // Signed Saturating Shift Right Unsigned Narrow (Immediate) - def SCALAR_SQSHRUN_N: SInst<"vqshrun_n", "zsi", "SsSiSl">; + def SCALAR_SQSHRUN_N: SInst<"vqshrun_n", "(1<)1I", "SsSiSl">; // Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate) - def SCALAR_SQRSHRUN_N: SInst<"vqrshrun_n", "zsi", "SsSiSl">; + def SCALAR_SQRSHRUN_N: SInst<"vqrshrun_n", "(1<)1I", "SsSiSl">; } //////////////////////////////////////////////////////////////////////////////// // Scalar Signed/Unsigned Fixed-point Convert To Floating-Point (Immediate) -def SCALAR_SCVTF_N_F32: SInst<"vcvt_n_f32", "ysi", "SiSUi">; -def SCALAR_SCVTF_N_F64: SInst<"vcvt_n_f64", "osi", "SlSUl">; +def SCALAR_SCVTF_N_F32: SInst<"vcvt_n_f32", "(1F)(1!)I", "SiSUi">; +def SCALAR_SCVTF_N_F64: SInst<"vcvt_n_f64", "(1F)(1!)I", "SlSUl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Convert To Signed/Unsigned Fixed-point (Immediate) -def SCALAR_FCVTZS_N_S32 : SInst<"vcvt_n_s32", "$si", "Sf">; -def SCALAR_FCVTZU_N_U32 : SInst<"vcvt_n_u32", "bsi", "Sf">; -def SCALAR_FCVTZS_N_S64 : SInst<"vcvt_n_s64", "$si", "Sd">; -def SCALAR_FCVTZU_N_U64 : SInst<"vcvt_n_u64", "bsi", "Sd">; +def SCALAR_FCVTZS_N_S32 : SInst<"vcvt_n_s32", "(1S)1I", "Sf">; +def SCALAR_FCVTZU_N_U32 : SInst<"vcvt_n_u32", "(1U)1I", "Sf">; +def SCALAR_FCVTZS_N_S64 : SInst<"vcvt_n_s64", "(1S)1I", "Sd">; +def SCALAR_FCVTZU_N_U64 : SInst<"vcvt_n_u64", "(1U)1I", "Sd">; } //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Round to Integral let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { -def SCALAR_FRINTN_S32 : SInst<"vrndn", "ss", "Sf">; +def SCALAR_FRINTN_S32 : SInst<"vrndn", "11", "Sf">; } //////////////////////////////////////////////////////////////////////////////// // Scalar Reduce Pairwise Addition (Scalar and Floating Point) -def SCALAR_ADDP : SInst<"vpadd", "sd", "SfSHlSHdSHUl">; +def SCALAR_ADDP : SInst<"vpadd", "1.", "SfSHlSHdSHUl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Reduce Floating Point Pairwise Max/Min -def SCALAR_FMAXP : SInst<"vpmax", "sd", "SfSQd">; +def SCALAR_FMAXP : SInst<"vpmax", "1.", "SfSQd">; -def SCALAR_FMINP : SInst<"vpmin", "sd", "SfSQd">; +def SCALAR_FMINP : SInst<"vpmin", "1.", "SfSQd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Reduce Floating Point Pairwise maxNum/minNum -def SCALAR_FMAXNMP : SInst<"vpmaxnm", "sd", "SfSQd">; -def SCALAR_FMINNMP : SInst<"vpminnm", "sd", "SfSQd">; +def SCALAR_FMAXNMP : SInst<"vpmaxnm", "1.", "SfSQd">; +def SCALAR_FMINNMP : SInst<"vpminnm", "1.", "SfSQd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Integer Saturating Doubling Multiply Half High -def SCALAR_SQDMULH : SInst<"vqdmulh", "sss", "SsSi">; +def SCALAR_SQDMULH : SInst<"vqdmulh", "111", "SsSi">; //////////////////////////////////////////////////////////////////////////////// // Scalar Integer Saturating Rounding Doubling Multiply Half High -def SCALAR_SQRDMULH : SInst<"vqrdmulh", "sss", "SsSi">; +def SCALAR_SQRDMULH : SInst<"vqrdmulh", "111", "SsSi">; let ArchGuard = 
"defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half -def SCALAR_SQRDMLAH : SOpInst<"vqrdmlah", "ssss", "SsSi", OP_QRDMLAH>; +def SCALAR_SQRDMLAH : SOpInst<"vqrdmlah", "1111", "SsSi", OP_QRDMLAH>; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half -def SCALAR_SQRDMLSH : SOpInst<"vqrdmlsh", "ssss", "SsSi", OP_QRDMLSH>; +def SCALAR_SQRDMLSH : SOpInst<"vqrdmlsh", "1111", "SsSi", OP_QRDMLSH>; } //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Multiply Extended -def SCALAR_FMULX : IInst<"vmulx", "sss", "SfSd">; +def SCALAR_FMULX : IInst<"vmulx", "111", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Reciprocal Step -def SCALAR_FRECPS : IInst<"vrecps", "sss", "SfSd">; +def SCALAR_FRECPS : IInst<"vrecps", "111", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Reciprocal Square Root Step -def SCALAR_FRSQRTS : IInst<"vrsqrts", "sss", "SfSd">; +def SCALAR_FRSQRTS : IInst<"vrsqrts", "111", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Signed Integer Convert To Floating-point -def SCALAR_SCVTFS : SInst<"vcvt_f32", "ys", "Si">; -def SCALAR_SCVTFD : SInst<"vcvt_f64", "os", "Sl">; +def SCALAR_SCVTFS : SInst<"vcvt_f32", "(1F)(1!)", "Si">; +def SCALAR_SCVTFD : SInst<"vcvt_f64", "(1F)(1!)", "Sl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Unsigned Integer Convert To Floating-point -def SCALAR_UCVTFS : SInst<"vcvt_f32", "ys", "SUi">; -def SCALAR_UCVTFD : SInst<"vcvt_f64", "os", "SUl">; +def SCALAR_UCVTFS : SInst<"vcvt_f32", "(1F)(1!)", "SUi">; +def SCALAR_UCVTFD : SInst<"vcvt_f64", "(1F)(1!)", "SUl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Converts -def SCALAR_FCVTXN : IInst<"vcvtx_f32", "ys", "Sd">; -def SCALAR_FCVTNSS : SInst<"vcvtn_s32", "$s", "Sf">; -def SCALAR_FCVTNUS : SInst<"vcvtn_u32", "bs", "Sf">; -def SCALAR_FCVTNSD : SInst<"vcvtn_s64", "$s", "Sd">; -def SCALAR_FCVTNUD : SInst<"vcvtn_u64", "bs", "Sd">; -def SCALAR_FCVTMSS : SInst<"vcvtm_s32", "$s", "Sf">; -def SCALAR_FCVTMUS : SInst<"vcvtm_u32", "bs", "Sf">; -def SCALAR_FCVTMSD : SInst<"vcvtm_s64", "$s", "Sd">; -def SCALAR_FCVTMUD : SInst<"vcvtm_u64", "bs", "Sd">; -def SCALAR_FCVTASS : SInst<"vcvta_s32", "$s", "Sf">; -def SCALAR_FCVTAUS : SInst<"vcvta_u32", "bs", "Sf">; -def SCALAR_FCVTASD : SInst<"vcvta_s64", "$s", "Sd">; -def SCALAR_FCVTAUD : SInst<"vcvta_u64", "bs", "Sd">; -def SCALAR_FCVTPSS : SInst<"vcvtp_s32", "$s", "Sf">; -def SCALAR_FCVTPUS : SInst<"vcvtp_u32", "bs", "Sf">; -def SCALAR_FCVTPSD : SInst<"vcvtp_s64", "$s", "Sd">; -def SCALAR_FCVTPUD : SInst<"vcvtp_u64", "bs", "Sd">; -def SCALAR_FCVTZSS : SInst<"vcvt_s32", "$s", "Sf">; -def SCALAR_FCVTZUS : SInst<"vcvt_u32", "bs", "Sf">; -def SCALAR_FCVTZSD : SInst<"vcvt_s64", "$s", "Sd">; -def SCALAR_FCVTZUD : SInst<"vcvt_u64", "bs", "Sd">; +def SCALAR_FCVTXN : IInst<"vcvtx_f32", "(1F<)(1!)", "Sd">; +def SCALAR_FCVTNSS : SInst<"vcvtn_s32", "(1S)1", "Sf">; +def SCALAR_FCVTNUS : SInst<"vcvtn_u32", "(1U)1", "Sf">; +def SCALAR_FCVTNSD : SInst<"vcvtn_s64", "(1S)1", "Sd">; 
+def SCALAR_FCVTNUD : SInst<"vcvtn_u64", "(1U)1", "Sd">;
+def SCALAR_FCVTMSS : SInst<"vcvtm_s32", "(1S)1", "Sf">;
+def SCALAR_FCVTMUS : SInst<"vcvtm_u32", "(1U)1", "Sf">;
+def SCALAR_FCVTMSD : SInst<"vcvtm_s64", "(1S)1", "Sd">;
+def SCALAR_FCVTMUD : SInst<"vcvtm_u64", "(1U)1", "Sd">;
+def SCALAR_FCVTASS : SInst<"vcvta_s32", "(1S)1", "Sf">;
+def SCALAR_FCVTAUS : SInst<"vcvta_u32", "(1U)1", "Sf">;
+def SCALAR_FCVTASD : SInst<"vcvta_s64", "(1S)1", "Sd">;
+def SCALAR_FCVTAUD : SInst<"vcvta_u64", "(1U)1", "Sd">;
+def SCALAR_FCVTPSS : SInst<"vcvtp_s32", "(1S)1", "Sf">;
+def SCALAR_FCVTPUS : SInst<"vcvtp_u32", "(1U)1", "Sf">;
+def SCALAR_FCVTPSD : SInst<"vcvtp_s64", "(1S)1", "Sd">;
+def SCALAR_FCVTPUD : SInst<"vcvtp_u64", "(1U)1", "Sd">;
+def SCALAR_FCVTZSS : SInst<"vcvt_s32", "(1S)1", "Sf">;
+def SCALAR_FCVTZUS : SInst<"vcvt_u32", "(1U)1", "Sf">;
+def SCALAR_FCVTZSD : SInst<"vcvt_s64", "(1S)1", "Sd">;
+def SCALAR_FCVTZUD : SInst<"vcvt_u64", "(1U)1", "Sd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Floating-point Reciprocal Estimate
-def SCALAR_FRECPE : IInst<"vrecpe", "ss", "SfSd">;
+def SCALAR_FRECPE : IInst<"vrecpe", "11", "SfSd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Floating-point Reciprocal Exponent
-def SCALAR_FRECPX : IInst<"vrecpx", "ss", "SfSd">;
+def SCALAR_FRECPX : IInst<"vrecpx", "11", "SfSd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Floating-point Reciprocal Square Root Estimate
-def SCALAR_FRSQRTE : IInst<"vrsqrte", "ss", "SfSd">;
+def SCALAR_FRSQRTE : IInst<"vrsqrte", "11", "SfSd">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Integer Comparison
-def SCALAR_CMEQ : SInst<"vceq", "sss", "SlSUl">;
-def SCALAR_CMEQZ : SInst<"vceqz", "ss", "SlSUl">;
-def SCALAR_CMGE : SInst<"vcge", "sss", "Sl">;
-def SCALAR_CMGEZ : SInst<"vcgez", "ss", "Sl">;
-def SCALAR_CMHS : SInst<"vcge", "sss", "SUl">;
-def SCALAR_CMLE : SInst<"vcle", "sss", "SlSUl">;
-def SCALAR_CMLEZ : SInst<"vclez", "ss", "Sl">;
-def SCALAR_CMLT : SInst<"vclt", "sss", "SlSUl">;
-def SCALAR_CMLTZ : SInst<"vcltz", "ss", "Sl">;
-def SCALAR_CMGT : SInst<"vcgt", "sss", "Sl">;
-def SCALAR_CMGTZ : SInst<"vcgtz", "ss", "Sl">;
-def SCALAR_CMHI : SInst<"vcgt", "sss", "SUl">;
-def SCALAR_CMTST : SInst<"vtst", "sss", "SlSUl">;
+def SCALAR_CMEQ : SInst<"vceq", "111", "SlSUl">;
+def SCALAR_CMEQZ : SInst<"vceqz", "11", "SlSUl">;
+def SCALAR_CMGE : SInst<"vcge", "111", "Sl">;
+def SCALAR_CMGEZ : SInst<"vcgez", "11", "Sl">;
+def SCALAR_CMHS : SInst<"vcge", "111", "SUl">;
+def SCALAR_CMLE : SInst<"vcle", "111", "SlSUl">;
+def SCALAR_CMLEZ : SInst<"vclez", "11", "Sl">;
+def SCALAR_CMLT : SInst<"vclt", "111", "SlSUl">;
+def SCALAR_CMLTZ : SInst<"vcltz", "11", "Sl">;
+def SCALAR_CMGT : SInst<"vcgt", "111", "Sl">;
+def SCALAR_CMGTZ : SInst<"vcgtz", "11", "Sl">;
+def SCALAR_CMHI : SInst<"vcgt", "111", "SUl">;
+def SCALAR_CMTST : SInst<"vtst", "111", "SlSUl">;
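The scalar conversions above pair a category override with `1`, so `(1S)1`/`(1U)1` produce a signed/unsigned integer scalar from a floating-point scalar (sketch):

int32_t vcvtns_s32_f32(float32_t a);   /* "(1S)1" on Sf */
uint64_t vcvtpd_u64_f64(float64_t a);  /* "(1U)1" on Sd */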
"bs", "SfSd">; -def SCALAR_FCMLT : IInst<"vclt", "bss", "SfSd">; -def SCALAR_FCMLTZ : IInst<"vcltz", "bs", "SfSd">; +def SCALAR_FCMEQ : IInst<"vceq", "(1U)11", "SfSd">; +def SCALAR_FCMEQZ : IInst<"vceqz", "(1U)1", "SfSd">; +def SCALAR_FCMGE : IInst<"vcge", "(1U)11", "SfSd">; +def SCALAR_FCMGEZ : IInst<"vcgez", "(1U)1", "SfSd">; +def SCALAR_FCMGT : IInst<"vcgt", "(1U)11", "SfSd">; +def SCALAR_FCMGTZ : IInst<"vcgtz", "(1U)1", "SfSd">; +def SCALAR_FCMLE : IInst<"vcle", "(1U)11", "SfSd">; +def SCALAR_FCMLEZ : IInst<"vclez", "(1U)1", "SfSd">; +def SCALAR_FCMLT : IInst<"vclt", "(1U)11", "SfSd">; +def SCALAR_FCMLTZ : IInst<"vcltz", "(1U)1", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Absolute Compare Mask Greater Than Or Equal -def SCALAR_FACGE : IInst<"vcage", "bss", "SfSd">; -def SCALAR_FACLE : IInst<"vcale", "bss", "SfSd">; +def SCALAR_FACGE : IInst<"vcage", "(1U)11", "SfSd">; +def SCALAR_FACLE : IInst<"vcale", "(1U)11", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Absolute Compare Mask Greater Than -def SCALAR_FACGT : IInst<"vcagt", "bss", "SfSd">; -def SCALAR_FACLT : IInst<"vcalt", "bss", "SfSd">; +def SCALAR_FACGT : IInst<"vcagt", "(1U)11", "SfSd">; +def SCALAR_FACLT : IInst<"vcalt", "(1U)11", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Absolute Value -def SCALAR_ABS : SInst<"vabs", "ss", "Sl">; +def SCALAR_ABS : SInst<"vabs", "11", "Sl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Absolute Difference -def SCALAR_ABD : IInst<"vabd", "sss", "SfSd">; +def SCALAR_ABD : IInst<"vabd", "111", "SfSd">; //////////////////////////////////////////////////////////////////////////////// // Scalar Signed Saturating Absolute Value -def SCALAR_SQABS : SInst<"vqabs", "ss", "ScSsSiSl">; +def SCALAR_SQABS : SInst<"vqabs", "11", "ScSsSiSl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Negate -def SCALAR_NEG : SInst<"vneg", "ss", "Sl">; +def SCALAR_NEG : SInst<"vneg", "11", "Sl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Signed Saturating Negate -def SCALAR_SQNEG : SInst<"vqneg", "ss", "ScSsSiSl">; +def SCALAR_SQNEG : SInst<"vqneg", "11", "ScSsSiSl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Signed Saturating Accumulated of Unsigned Value -def SCALAR_SUQADD : SInst<"vuqadd", "ssb", "ScSsSiSl">; +def SCALAR_SUQADD : SInst<"vuqadd", "11(1U)", "ScSsSiSl">; //////////////////////////////////////////////////////////////////////////////// // Scalar Unsigned Saturating Accumulated of Signed Value -def SCALAR_USQADD : SInst<"vsqadd", "ss$", "SUcSUsSUiSUl">; +def SCALAR_USQADD : SInst<"vsqadd", "11(1S)", "SUcSUsSUiSUl">; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Doubling Multiply-Add Long -def SCALAR_SQDMLAL : SInst<"vqdmlal", "rrss", "SsSi">; +def SCALAR_SQDMLAL : SInst<"vqdmlal", "(1>)(1>)11", "SsSi">; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Doubling Multiply-Subtract Long -def SCALAR_SQDMLSL : SInst<"vqdmlsl", "rrss", "SsSi">; +def SCALAR_SQDMLSL : SInst<"vqdmlsl", "(1>)(1>)11", "SsSi">; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Doubling Multiply Long 
 
 ////////////////////////////////////////////////////////////////////////////////
 // Signed Saturating Doubling Multiply Long
-def SCALAR_SQDMULL : SInst<"vqdmull", "rss", "SsSi">;
+def SCALAR_SQDMULL : SInst<"vqdmull", "(1>)11", "SsSi">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Signed Saturating Extract Unsigned Narrow
-def SCALAR_SQXTUN : SInst<"vqmovun", "zs", "SsSiSl">;
+def SCALAR_SQXTUN : SInst<"vqmovun", "(1<)1", "SsSiSl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Signed Saturating Extract Narrow
-def SCALAR_SQXTN : SInst<"vqmovn", "zs", "SsSiSl">;
+def SCALAR_SQXTN : SInst<"vqmovn", "(1<)1", "SsSiSl">;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scalar Unsigned Saturating Extract Narrow
-def SCALAR_UQXTN : SInst<"vqmovn", "zs", "SUsSUiSUl">;
+def SCALAR_UQXTN : SInst<"vqmovn", "(1<)1", "SUsSUiSUl">;
 
 // Scalar Floating Point multiply (scalar, by element)
-def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "ssdi", "SfSd", OP_SCALAR_MUL_LN>;
-def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "ssji", "SfSd", OP_SCALAR_MUL_LN>;
+def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "11.I", "SfSd", OP_SCALAR_MUL_LN>;
+def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN>;
 
 // Scalar Floating Point multiply extended (scalar, by element)
-def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "ssdi", "SfSd", OP_SCALAR_MULX_LN>;
-def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "ssji", "SfSd", OP_SCALAR_MULX_LN>;
+def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "11.I", "SfSd", OP_SCALAR_MULX_LN>;
+def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN>;
 
-def SCALAR_VMUL_N : IInst<"vmul_n", "dds", "d">;
+def SCALAR_VMUL_N : IInst<"vmul_n", "..1", "d">;
 
 // VMUL_LANE_A64 d type implemented using scalar mul lane
-def SCALAR_VMUL_LANE : IInst<"vmul_lane", "ddgi", "d">;
+def SCALAR_VMUL_LANE : IInst<"vmul_lane", "..qI", "d">;
 
 // VMUL_LANEQ d type implemented using scalar mul lane
-def SCALAR_VMUL_LANEQ : IInst<"vmul_laneq", "ddji", "d"> {
+def SCALAR_VMUL_LANEQ : IInst<"vmul_laneq", "..QI", "d"> {
   let isLaneQ = 1;
 }
 
 // VMULX_LANE d type implemented using scalar vmulx_lane
-def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "ddgi", "d", OP_SCALAR_VMULX_LN>;
+def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "d", OP_SCALAR_VMULX_LN>;
 
 // VMULX_LANEQ d type implemented using scalar vmulx_laneq
-def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "ddji", "d", OP_SCALAR_VMULX_LNQ>;
+def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ>;
 
 // Scalar Floating Point fused multiply-add (scalar, by element)
-def SCALAR_FMLA_LANE : IInst<"vfma_lane", "sssdi", "SfSd">;
-def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "sssji", "SfSd">;
+def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd">;
+def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd">;
 
 // Scalar Floating Point fused multiply-subtract (scalar, by element)
-def SCALAR_FMLS_LANE : IOpInst<"vfms_lane", "sssdi", "SfSd", OP_FMS_LN>;
-def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "sssji", "SfSd", OP_FMS_LNQ>;
+def SCALAR_FMLS_LANE : IOpInst<"vfms_lane", "111.I", "SfSd", OP_FMS_LN>;
+def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ>;
 
 // Signed Saturating Doubling Multiply Long (scalar by element)
-def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "rsdi", "SsSi", OP_SCALAR_QDMULL_LN>;
-def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "rsji", "SsSi", OP_SCALAR_QDMULL_LN>;
+def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "(1>)1.I", "SsSi", OP_SCALAR_QDMULL_LN>;
+def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN>;
 
 // Signed Saturating Doubling Multiply-Add Long (scalar by element)
-def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "rrsdi", "SsSi">;
-def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "rrsji", "SsSi">;
+def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi">;
+def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi">;
 
 // Signed Saturating Doubling Multiply-Subtract Long (scalar by element)
-def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "rrsdi", "SsSi">;
-def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "rrsji", "SsSi">;
+def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi">;
+def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi">;
 
 // Scalar Integer Saturating Doubling Multiply Half High (scalar by element)
-def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "ssdi", "SsSi", OP_SCALAR_QDMULH_LN>;
-def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "ssji", "SsSi", OP_SCALAR_QDMULH_LN>;
+def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "11.I", "SsSi", OP_SCALAR_QDMULH_LN>;
+def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN>;
 
 // Scalar Integer Saturating Rounding Doubling Multiply Half High
-def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "ssdi", "SsSi", OP_SCALAR_QRDMULH_LN>;
-def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "ssji", "SsSi", OP_SCALAR_QRDMULH_LN>;
+def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "11.I", "SsSi", OP_SCALAR_QRDMULH_LN>;
+def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN>;
 
 let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in {
 // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half
-def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "sssdi", "SsSi", OP_SCALAR_QRDMLAH_LN>;
-def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "sssji", "SsSi", OP_SCALAR_QRDMLAH_LN>;
+def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>;
+def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN>;
 
 // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half
-def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "sssdi", "SsSi", OP_SCALAR_QRDMLSH_LN>;
-def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "sssji", "SsSi", OP_SCALAR_QRDMLSH_LN>;
+def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_QRDMLSH_LN>;
+def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>;
 }
 
-def SCALAR_VDUP_LANE : IInst<"vdup_lane", "sdi", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
-def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "sji", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
+def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
+def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">;
 }
 
 // ARMv8.2-A FP16 vector intrinsics for A32/A64.
@@ -1441,234 +1442,252 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
 
 // ARMv8.2-A FP16 one-operand vector intrinsics.
// Comparison - def CMEQH : SInst<"vceqz", "ud", "hQh">; - def CMGEH : SInst<"vcgez", "ud", "hQh">; - def CMGTH : SInst<"vcgtz", "ud", "hQh">; - def CMLEH : SInst<"vclez", "ud", "hQh">; - def CMLTH : SInst<"vcltz", "ud", "hQh">; + def CMEQH : SInst<"vceqz", "U.", "hQh">; + def CMGEH : SInst<"vcgez", "U.", "hQh">; + def CMGTH : SInst<"vcgtz", "U.", "hQh">; + def CMLEH : SInst<"vclez", "U.", "hQh">; + def CMLTH : SInst<"vcltz", "U.", "hQh">; // Vector conversion - def VCVT_F16 : SInst<"vcvt_f16", "Hd", "sUsQsQUs">; - def VCVT_S16 : SInst<"vcvt_s16", "xd", "hQh">; - def VCVT_U16 : SInst<"vcvt_u16", "ud", "hQh">; - def VCVTA_S16 : SInst<"vcvta_s16", "xd", "hQh">; - def VCVTA_U16 : SInst<"vcvta_u16", "ud", "hQh">; - def VCVTM_S16 : SInst<"vcvtm_s16", "xd", "hQh">; - def VCVTM_U16 : SInst<"vcvtm_u16", "ud", "hQh">; - def VCVTN_S16 : SInst<"vcvtn_s16", "xd", "hQh">; - def VCVTN_U16 : SInst<"vcvtn_u16", "ud", "hQh">; - def VCVTP_S16 : SInst<"vcvtp_s16", "xd", "hQh">; - def VCVTP_U16 : SInst<"vcvtp_u16", "ud", "hQh">; + def VCVT_F16 : SInst<"vcvt_f16", "F(.!)", "sUsQsQUs">; + def VCVT_S16 : SInst<"vcvt_s16", "S.", "hQh">; + def VCVT_U16 : SInst<"vcvt_u16", "U.", "hQh">; + def VCVTA_S16 : SInst<"vcvta_s16", "S.", "hQh">; + def VCVTA_U16 : SInst<"vcvta_u16", "U.", "hQh">; + def VCVTM_S16 : SInst<"vcvtm_s16", "S.", "hQh">; + def VCVTM_U16 : SInst<"vcvtm_u16", "U.", "hQh">; + def VCVTN_S16 : SInst<"vcvtn_s16", "S.", "hQh">; + def VCVTN_U16 : SInst<"vcvtn_u16", "U.", "hQh">; + def VCVTP_S16 : SInst<"vcvtp_s16", "S.", "hQh">; + def VCVTP_U16 : SInst<"vcvtp_u16", "U.", "hQh">; // Vector rounding let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { - def FRINTZH : SInst<"vrnd", "dd", "hQh">; - def FRINTNH : SInst<"vrndn", "dd", "hQh">; - def FRINTAH : SInst<"vrnda", "dd", "hQh">; - def FRINTPH : SInst<"vrndp", "dd", "hQh">; - def FRINTMH : SInst<"vrndm", "dd", "hQh">; - def FRINTXH : SInst<"vrndx", "dd", "hQh">; + def FRINTZH : SInst<"vrnd", "..", "hQh">; + def FRINTNH : SInst<"vrndn", "..", "hQh">; + def FRINTAH : SInst<"vrnda", "..", "hQh">; + def FRINTPH : SInst<"vrndp", "..", "hQh">; + def FRINTMH : SInst<"vrndm", "..", "hQh">; + def FRINTXH : SInst<"vrndx", "..", "hQh">; } // Misc. - def VABSH : SInst<"vabs", "dd", "hQh">; - def VNEGH : SOpInst<"vneg", "dd", "hQh", OP_NEG>; - def VRECPEH : SInst<"vrecpe", "dd", "hQh">; - def FRSQRTEH : SInst<"vrsqrte", "dd", "hQh">; + def VABSH : SInst<"vabs", "..", "hQh">; + def VNEGH : SOpInst<"vneg", "..", "hQh", OP_NEG>; + def VRECPEH : SInst<"vrecpe", "..", "hQh">; + def FRSQRTEH : SInst<"vrsqrte", "..", "hQh">; // ARMv8.2-A FP16 two-operands vector intrinsics. // Misc. 
- def VADDH : SOpInst<"vadd", "ddd", "hQh", OP_ADD>; - def VABDH : SInst<"vabd", "ddd", "hQh">; - def VSUBH : SOpInst<"vsub", "ddd", "hQh", OP_SUB>; + def VADDH : SOpInst<"vadd", "...", "hQh", OP_ADD>; + def VABDH : SInst<"vabd", "...", "hQh">; + def VSUBH : SOpInst<"vsub", "...", "hQh", OP_SUB>; // Comparison let InstName = "vacge" in { - def VCAGEH : SInst<"vcage", "udd", "hQh">; - def VCALEH : SInst<"vcale", "udd", "hQh">; + def VCAGEH : SInst<"vcage", "U..", "hQh">; + def VCALEH : SInst<"vcale", "U..", "hQh">; } let InstName = "vacgt" in { - def VCAGTH : SInst<"vcagt", "udd", "hQh">; - def VCALTH : SInst<"vcalt", "udd", "hQh">; + def VCAGTH : SInst<"vcagt", "U..", "hQh">; + def VCALTH : SInst<"vcalt", "U..", "hQh">; } - def VCEQH : SOpInst<"vceq", "udd", "hQh", OP_EQ>; - def VCGEH : SOpInst<"vcge", "udd", "hQh", OP_GE>; - def VCGTH : SOpInst<"vcgt", "udd", "hQh", OP_GT>; + def VCEQH : SOpInst<"vceq", "U..", "hQh", OP_EQ>; + def VCGEH : SOpInst<"vcge", "U..", "hQh", OP_GE>; + def VCGTH : SOpInst<"vcgt", "U..", "hQh", OP_GT>; let InstName = "vcge" in - def VCLEH : SOpInst<"vcle", "udd", "hQh", OP_LE>; + def VCLEH : SOpInst<"vcle", "U..", "hQh", OP_LE>; let InstName = "vcgt" in - def VCLTH : SOpInst<"vclt", "udd", "hQh", OP_LT>; + def VCLTH : SOpInst<"vclt", "U..", "hQh", OP_LT>; // Vector conversion let isVCVT_N = 1 in { - def VCVT_N_F16 : SInst<"vcvt_n_f16", "Hdi", "sUsQsQUs">; - def VCVT_N_S16 : SInst<"vcvt_n_s16", "xdi", "hQh">; - def VCVT_N_U16 : SInst<"vcvt_n_u16", "udi", "hQh">; + def VCVT_N_F16 : SInst<"vcvt_n_f16", "F(.!)I", "sUsQsQUs">; + def VCVT_N_S16 : SInst<"vcvt_n_s16", "S.I", "hQh">; + def VCVT_N_U16 : SInst<"vcvt_n_u16", "U.I", "hQh">; } // Max/Min - def VMAXH : SInst<"vmax", "ddd", "hQh">; - def VMINH : SInst<"vmin", "ddd", "hQh">; + def VMAXH : SInst<"vmax", "...", "hQh">; + def VMINH : SInst<"vmin", "...", "hQh">; let ArchGuard = "__ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { - def FMAXNMH : SInst<"vmaxnm", "ddd", "hQh">; - def FMINNMH : SInst<"vminnm", "ddd", "hQh">; + def FMAXNMH : SInst<"vmaxnm", "...", "hQh">; + def FMINNMH : SInst<"vminnm", "...", "hQh">; } // Multiplication/Division - def VMULH : SOpInst<"vmul", "ddd", "hQh", OP_MUL>; + def VMULH : SOpInst<"vmul", "...", "hQh", OP_MUL>; // Pairwise addition - def VPADDH : SInst<"vpadd", "ddd", "h">; + def VPADDH : SInst<"vpadd", "...", "h">; // Pairwise Max/Min - def VPMAXH : SInst<"vpmax", "ddd", "h">; - def VPMINH : SInst<"vpmin", "ddd", "h">; + def VPMAXH : SInst<"vpmax", "...", "h">; + def VPMINH : SInst<"vpmin", "...", "h">; // Reciprocal/Sqrt - def VRECPSH : SInst<"vrecps", "ddd", "hQh">; - def VRSQRTSH : SInst<"vrsqrts", "ddd", "hQh">; + def VRECPSH : SInst<"vrecps", "...", "hQh">; + def VRSQRTSH : SInst<"vrsqrts", "...", "hQh">; // ARMv8.2-A FP16 three-operands vector intrinsics. // Vector fused multiply-add operations - def VFMAH : SInst<"vfma", "dddd", "hQh">; - def VFMSH : SOpInst<"vfms", "dddd", "hQh", OP_FMLS>; + def VFMAH : SInst<"vfma", "....", "hQh">; + def VFMSH : SOpInst<"vfms", "....", "hQh", OP_FMLS>; // ARMv8.2-A FP16 lane vector intrinsics. 
// Mul lane - def VMUL_LANEH : IOpInst<"vmul_lane", "ddgi", "hQh", OP_MUL_LN>; - def VMUL_NH : IOpInst<"vmul_n", "dds", "hQh", OP_MUL_N>; + def VMUL_LANEH : IOpInst<"vmul_lane", "..qI", "hQh", OP_MUL_LN>; + def VMUL_NH : IOpInst<"vmul_n", "..1", "hQh", OP_MUL_N>; // Data processing intrinsics - section 5 // Logical operations let isHiddenLInst = 1 in - def VBSLH : SInst<"vbsl", "dudd", "hQh">; + def VBSLH : SInst<"vbsl", ".U..", "hQh">; // Transposition operations - def VZIPH : WInst<"vzip", "2dd", "hQh">; - def VUZPH : WInst<"vuzp", "2dd", "hQh">; - def VTRNH : WInst<"vtrn", "2dd", "hQh">; + def VZIPH : WInst<"vzip", "2..", "hQh">; + def VUZPH : WInst<"vuzp", "2..", "hQh">; + def VTRNH : WInst<"vtrn", "2..", "hQh">; let ArchGuard = "!defined(__aarch64__)" in { // Set all lanes to same value. // Already implemented prior to ARMv8.2-A. - def VMOV_NH : WOpInst<"vmov_n", "ds", "hQh", OP_DUP>; - def VDUP_NH : WOpInst<"vdup_n", "ds", "hQh", OP_DUP>; - def VDUP_LANE1H : WOpInst<"vdup_lane", "dgi", "hQh", OP_DUP_LN>; + def VMOV_NH : WOpInst<"vmov_n", ".1", "hQh", OP_DUP>; + def VDUP_NH : WOpInst<"vdup_n", ".1", "hQh", OP_DUP>; + def VDUP_LANE1H : WOpInst<"vdup_lane", ".qI", "hQh", OP_DUP_LN>; } // Vector Extract - def VEXTH : WInst<"vext", "dddi", "hQh">; + def VEXTH : WInst<"vext", "...I", "hQh">; // Reverse vector elements - def VREV64H : WOpInst<"vrev64", "dd", "hQh", OP_REV64>; + def VREV64H : WOpInst<"vrev64", "..", "hQh", OP_REV64>; } // ARMv8.2-A FP16 vector intrinsics for A64 only. let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__)" in { // Vector rounding - def FRINTIH : SInst<"vrndi", "dd", "hQh">; + def FRINTIH : SInst<"vrndi", "..", "hQh">; // Misc. - def FSQRTH : SInst<"vsqrt", "dd", "hQh">; + def FSQRTH : SInst<"vsqrt", "..", "hQh">; // Multiplication/Division - def MULXH : SInst<"vmulx", "ddd", "hQh">; - def FDIVH : IOpInst<"vdiv", "ddd", "hQh", OP_DIV>; + def MULXH : SInst<"vmulx", "...", "hQh">; + def FDIVH : IOpInst<"vdiv", "...", "hQh", OP_DIV>; // Pairwise addition - def VPADDH1 : SInst<"vpadd", "ddd", "Qh">; + def VPADDH1 : SInst<"vpadd", "...", "Qh">; // Pairwise Max/Min - def VPMAXH1 : SInst<"vpmax", "ddd", "Qh">; - def VPMINH1 : SInst<"vpmin", "ddd", "Qh">; + def VPMAXH1 : SInst<"vpmax", "...", "Qh">; + def VPMINH1 : SInst<"vpmin", "...", "Qh">; // Pairwise MaxNum/MinNum - def FMAXNMPH : SInst<"vpmaxnm", "ddd", "hQh">; - def FMINNMPH : SInst<"vpminnm", "ddd", "hQh">; + def FMAXNMPH : SInst<"vpmaxnm", "...", "hQh">; + def FMINNMPH : SInst<"vpminnm", "...", "hQh">; // ARMv8.2-A FP16 lane vector intrinsics. 
// FMA lane - def VFMA_LANEH : IInst<"vfma_lane", "dddgi", "hQh">; - def VFMA_LANEQH : IInst<"vfma_laneq", "dddji", "hQh">; + def VFMA_LANEH : IInst<"vfma_lane", "...qI", "hQh">; + def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh">; // FMA lane with scalar argument - def FMLA_NH : SOpInst<"vfma_n", "ddds", "hQh", OP_FMLA_N>; + def FMLA_NH : SOpInst<"vfma_n", "...1", "hQh", OP_FMLA_N>; // Scalar floating point fused multiply-add (scalar, by element) - def SCALAR_FMLA_LANEH : IInst<"vfma_lane", "sssdi", "Sh">; - def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "sssji", "Sh">; + def SCALAR_FMLA_LANEH : IInst<"vfma_lane", "111.I", "Sh">; + def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh">; // FMS lane - def VFMS_LANEH : IOpInst<"vfms_lane", "dddgi", "hQh", OP_FMS_LN>; - def VFMS_LANEQH : IOpInst<"vfms_laneq", "dddji", "hQh", OP_FMS_LNQ>; + def VFMS_LANEH : IOpInst<"vfms_lane", "...qI", "hQh", OP_FMS_LN>; + def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ>; // FMS lane with scalar argument - def FMLS_NH : SOpInst<"vfms_n", "ddds", "hQh", OP_FMLS_N>; + def FMLS_NH : SOpInst<"vfms_n", "...1", "hQh", OP_FMLS_N>; // Scalar floating point fused multiply-subtract (scalar, by element) - def SCALAR_FMLS_LANEH : IOpInst<"vfms_lane", "sssdi", "Sh", OP_FMS_LN>; - def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "sssji", "Sh", OP_FMS_LNQ>; + def SCALAR_FMLS_LANEH : IOpInst<"vfms_lane", "111.I", "Sh", OP_FMS_LN>; + def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ>; // Mul lane - def VMUL_LANEQH : IOpInst<"vmul_laneq", "ddji", "hQh", OP_MUL_LN>; + def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN>; // Scalar floating point multiply (scalar, by element) - def SCALAR_FMUL_LANEH : IOpInst<"vmul_lane", "ssdi", "Sh", OP_SCALAR_MUL_LN>; - def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "ssji", "Sh", OP_SCALAR_MUL_LN>; + def SCALAR_FMUL_LANEH : IOpInst<"vmul_lane", "11.I", "Sh", OP_SCALAR_MUL_LN>; + def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN>; // Mulx lane - def VMULX_LANEH : IOpInst<"vmulx_lane", "ddgi", "hQh", OP_MULX_LN>; - def VMULX_LANEQH : IOpInst<"vmulx_laneq", "ddji", "hQh", OP_MULX_LN>; - def VMULX_NH : IOpInst<"vmulx_n", "dds", "hQh", OP_MULX_N>; + def VMULX_LANEH : IOpInst<"vmulx_lane", "..qI", "hQh", OP_MULX_LN>; + def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN>; + def VMULX_NH : IOpInst<"vmulx_n", "..1", "hQh", OP_MULX_N>; // Scalar floating point mulx (scalar, by element) - def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "ssdi", "Sh">; - def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "ssji", "Sh">; + def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh">; + def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh">; // ARMv8.2-A FP16 reduction vector intrinsics. 
- def VMAXVH : SInst<"vmaxv", "sd", "hQh">; - def VMINVH : SInst<"vminv", "sd", "hQh">; - def FMAXNMVH : SInst<"vmaxnmv", "sd", "hQh">; - def FMINNMVH : SInst<"vminnmv", "sd", "hQh">; + def VMAXVH : SInst<"vmaxv", "1.", "hQh">; + def VMINVH : SInst<"vminv", "1.", "hQh">; + def FMAXNMVH : SInst<"vmaxnmv", "1.", "hQh">; + def FMINNMVH : SInst<"vminnmv", "1.", "hQh">; // Permutation - def VTRN1H : SOpInst<"vtrn1", "ddd", "hQh", OP_TRN1>; - def VZIP1H : SOpInst<"vzip1", "ddd", "hQh", OP_ZIP1>; - def VUZP1H : SOpInst<"vuzp1", "ddd", "hQh", OP_UZP1>; - def VTRN2H : SOpInst<"vtrn2", "ddd", "hQh", OP_TRN2>; - def VZIP2H : SOpInst<"vzip2", "ddd", "hQh", OP_ZIP2>; - def VUZP2H : SOpInst<"vuzp2", "ddd", "hQh", OP_UZP2>; - - def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "sdi", "Sh">; - def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "sji", "Sh">; + def VTRN1H : SOpInst<"vtrn1", "...", "hQh", OP_TRN1>; + def VZIP1H : SOpInst<"vzip1", "...", "hQh", OP_ZIP1>; + def VUZP1H : SOpInst<"vuzp1", "...", "hQh", OP_UZP1>; + def VTRN2H : SOpInst<"vtrn2", "...", "hQh", OP_TRN2>; + def VZIP2H : SOpInst<"vzip2", "...", "hQh", OP_ZIP2>; + def VUZP2H : SOpInst<"vuzp2", "...", "hQh", OP_UZP2>; + + def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "1.I", "Sh">; + def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh">; } // v8.2-A dot product instructions. let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in { - def DOT : SInst<"vdot", "dd88", "iQiUiQUi">; - def DOT_LANE : SOpInst<"vdot_lane", "dd87i", "iUiQiQUi", OP_DOT_LN>; + def DOT : SInst<"vdot", "..(<<)(<<)", "iQiUiQUi">; + def DOT_LANE : SOpInst<"vdot_lane", "..(<<)(<<q)I", "iUiQiQUi", OP_DOT_LN>; } let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { // Variants indexing into a 128-bit vector are A64 only. - def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>; + def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<<Q)I", "iUiQiQUi", OP_DOT_LNQ>; } // v8.2-A FP16 fused multiply-add long instructions. 
let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in { - def VFMLAL_LOW : SInst<"vfmlal_low", "nndd", "hQh">; - def VFMLSL_LOW : SInst<"vfmlsl_low", "nndd", "hQh">; - def VFMLAL_HIGH : SInst<"vfmlal_high", "nndd", "hQh">; - def VFMLSL_HIGH : SInst<"vfmlsl_high", "nndd", "hQh">; - - def VFMLAL_LANE_LOW : SOpInst<"vfmlal_lane_low", "ffH0i", "hQh", OP_FMLAL_LN>; - def VFMLSL_LANE_LOW : SOpInst<"vfmlsl_lane_low", "ffH0i", "hQh", OP_FMLSL_LN>; - def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "hQh", OP_FMLAL_LN_Hi>; - def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "hQh", OP_FMLSL_LN_Hi>; - - def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "ffH1i", "hQh", OP_FMLAL_LN>; - def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "ffH1i", "hQh", OP_FMLSL_LN>; - def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "hQh", OP_FMLAL_LN_Hi>; - def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "hQh", OP_FMLSL_LN_Hi>; + def VFMLAL_LOW : SInst<"vfmlal_low", ">>..", "hQh">; + def VFMLSL_LOW : SInst<"vfmlsl_low", ">>..", "hQh">; + def VFMLAL_HIGH : SInst<"vfmlal_high", ">>..", "hQh">; + def VFMLSL_HIGH : SInst<"vfmlsl_high", ">>..", "hQh">; + + def VFMLAL_LANE_LOW : SOpInst<"vfmlal_lane_low", "(F>)(F>)F(Fq)I", "hQh", OP_FMLAL_LN>; + def VFMLSL_LANE_LOW : SOpInst<"vfmlsl_lane_low", "(F>)(F>)F(Fq)I", "hQh", OP_FMLSL_LN>; + def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLAL_LN_Hi>; + def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLSL_LN_Hi>; + + def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN>; + def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN>; + def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>; + def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>; } + +// v8.3-A Vector complex addition intrinsics +let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in { + def VCADD_ROT90_FP16 : SInst<"vcadd_rot90", "...", "h">; + def VCADD_ROT270_FP16 : SInst<"vcadd_rot270", "...", "h">; + def VCADDQ_ROT90_FP16 : SInst<"vcaddq_rot90", "QQQ", "h">; + def VCADDQ_ROT270_FP16 : SInst<"vcaddq_rot270", "QQQ", "h">; +} +let ArchGuard = "defined(__ARM_FEATURE_COMPLEX)" in { + def VCADD_ROT90 : SInst<"vcadd_rot90", "...", "f">; + def VCADD_ROT270 : SInst<"vcadd_rot270", "...", "f">; + def VCADDQ_ROT90 : SInst<"vcaddq_rot90", "QQQ", "f">; + def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">; +} +let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in { + def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">; + def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">; +} \ No newline at end of file diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td index 984ed787037f6..28b00d162a00d 100644 --- a/clang/include/clang/Basic/arm_neon_incl.td +++ b/clang/include/clang/Basic/arm_neon_incl.td @@ -198,10 +198,8 @@ def OP_UNAVAILABLE : Operation { // // The prototype is a string that defines the return type of the intrinsic // and the type of each argument. The return type and every argument gets a -// "modifier" that can change in some way the "base type" of the intrinsic. -// -// The modifier 'd' means "default" and does not modify the base type in any -// way. The available modifiers are given below. 
+// set of "modifiers" that can change in some way the "base type" of the +// intrinsic. // // Typespecs // --------- @@ -226,41 +224,34 @@ def OP_UNAVAILABLE : Operation { // ------------------- // prototype: return (arg, arg, ...) // -// v: void -// t: best-fit integer (int/poly args) -// x: signed integer (int/float args) -// u: unsigned integer (int/float args) -// f: float (int args) -// F: double (int args) -// H: half (int args) -// 0: half (int args), ignore 'Q' size modifier. -// 1: half (int args), force 'Q' size modifier. -// d: default -// g: default, ignore 'Q' size modifier. -// j: default, force 'Q' size modifier. -// w: double width elements, same num elts -// n: double width elements, half num elts -// h: half width elements, double num elts -// q: half width elements, quad num elts -// e: half width elements, double num elts, unsigned -// m: half width elements, same num elts -// i: constant int -// l: constant uint64 -// s: scalar of element type -// z: scalar of half width element type, signed -// r: scalar of double width element type, signed -// b: scalar of unsigned integer/long type (int/float args) -// $: scalar of signed integer/long type (int/float args) -// y: scalar of float -// o: scalar of double -// k: default elt width, double num elts -// 2,3,4: array of default vectors -// B,C,D: array of default elts, force 'Q' size modifier. -// p: pointer type -// c: const pointer type -// 7: vector of 8-bit elements, ignore 'Q' size modifier -// 8: vector of 8-bit elements, same width as default type -// 9: vector of 8-bit elements, force 'Q' size modifier +// Each type modifier is either a single character, or a group surrounded by +// parentheses. +// +// .: default +// v: change to void category. +// S: change to signed integer category. +// U: change to unsigned integer category. +// F: change to floating category. +// P: change to polynomial category. +// p: change polynomial to equivalent integer category. Otherwise nop. +// +// >: double element width (vector size unchanged). +// <: half element width (vector size unchanged). +// +// 1: change to scalar. +// 2: change to struct of two vectors. +// 3: change to struct of three vectors. +// 4: change to struct of four vectors. +// +// *: make a pointer argument. +// c: make a constant argument (for pointers). +// +// Q: force 128-bit width. +// q: force 64-bit width. +// +// I: make 32-bit signed scalar immediate +// !: make this the key type passed to CGBuiltin.cpp in a polymorphic call. + // Every intrinsic subclasses Inst. class Inst <string n, string p, string t, Operation o> { diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index da8f819dee964..d382cf77a8b22 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -9,6 +9,7 @@ #ifndef LLVM_CLANG_DRIVER_DISTRO_H #define LLVM_CLANG_DRIVER_DISTRO_H +#include "llvm/ADT/Triple.h" #include "llvm/Support/VirtualFileSystem.h" namespace clang { @@ -84,7 +85,7 @@ class Distro { Distro(DistroType D) : DistroVal(D) {} /// Detects the distribution using specified VFS. 
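As a worked illustration of the new prototype scheme documented above: modifiers are read left to right, one character or parenthesized group per position with the return type first, and each one rewrites the base type named by the typespec. A minimal sketch, decoding two prototypes from the hunks above into the C declarations they should produce (assuming the usual arm_neon.h vector typedefs):

// "(1>)11" on typespec "Ss" (scalar int16): the return type is a scalar ('1')
// of doubled element width ('>'); both arguments are plain scalars ('1').
int32_t vqdmullh_s16(int16_t a, int16_t b);

// "..(<<)(<<q)I" on typespec "i" (int32x2_t): '.' keeps the base type, '<<'
// halves the element width twice (int32 -> int8, vector size unchanged),
// 'q' forces the 64-bit form, and 'I' is a constant immediate (the lane).
int32x2_t vdot_lane_s32(int32x2_t r, int8x8_t a, int8x8_t b, const int lane);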
- explicit Distro(llvm::vfs::FileSystem &VFS); + explicit Distro(llvm::vfs::FileSystem &VFS, const llvm::Triple &TargetOrHost); bool operator==(const Distro &Other) const { return DistroVal == Other.DistroVal; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9ae5fb54e1cfa..1b7ddb501f150 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -872,8 +872,6 @@ def fconstexpr_depth_EQ : Joined<["-"], "fconstexpr-depth=">, Group; def fconstexpr_steps_EQ : Joined<["-"], "fconstexpr-steps=">, Group; def fexperimental_new_constant_interpreter : Flag<["-"], "fexperimental-new-constant-interpreter">, Group, HelpText<"Enable the experimental new constant interpreter">, Flags<[CC1Option]>; -def fforce_experimental_new_constant_interpreter : Flag<["-"], "fforce-experimental-new-constant-interpreter">, Group, - HelpText<"Force the use of the experimental new constant interpreter, failing on missing features">, Flags<[CC1Option]>; def fconstexpr_backtrace_limit_EQ : Joined<["-"], "fconstexpr-backtrace-limit=">, Group; def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">, Group, Flags<[NoArgumentUnused, CoreOption]>, @@ -953,6 +951,10 @@ def : Flag<["-"], "fextended-identifiers">, Group; def : Flag<["-"], "fno-extended-identifiers">, Group, Flags<[Unsupported]>; def fhosted : Flag<["-"], "fhosted">, Group; def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">, Group, Flags<[CC1Option]>; +def ffp_model_EQ : Joined<["-"], "ffp-model=">, Group, Flags<[DriverOption]>, + HelpText<"Controls the semantics of floating-point calculations.">; +def ffp_exception_behavior_EQ : Joined<["-"], "ffp-exception-behavior=">, Group, Flags<[CC1Option]>, + HelpText<"Specifies the exception behavior of floating-point operations.">; def ffast_math : Flag<["-"], "ffast-math">, Group, Flags<[CC1Option]>, HelpText<"Allow aggressive, lossy floating-point optimizations">; def fno_fast_math : Flag<["-"], "fno-fast-math">, Group; @@ -1178,6 +1180,8 @@ def fno_honor_infinities : Flag<["-"], "fno-honor-infinities">, Group; // This option was originally misspelt "infinites" [sic]. 
def : Flag<["-"], "fhonor-infinites">, Alias; def : Flag<["-"], "fno-honor-infinites">, Alias; +def frounding_math : Flag<["-"], "frounding-math">, Group, Flags<[CC1Option]>; +def fno_rounding_math : Flag<["-"], "fno-rounding-math">, Group, Flags<[CC1Option]>; def ftrapping_math : Flag<["-"], "ftrapping-math">, Group, Flags<[CC1Option]>; def fno_trapping_math : Flag<["-"], "fno-trapping-math">, Group, Flags<[CC1Option]>; def ffp_contract : Joined<["-"], "ffp-contract=">, Group, @@ -2023,6 +2027,12 @@ def fdebug_prefix_map_EQ : Joined<["-"], "fdebug-prefix-map=">, Group, Flags<[CC1Option,CC1AsOption]>, HelpText<"remap file source paths in debug info">; +def ffile_prefix_map_EQ + : Joined<["-"], "ffile-prefix-map=">, Group, Flags<[CC1Option]>, + HelpText<"remap file source paths in debug info and predefined preprocessor macros">; +def fmacro_prefix_map_EQ + : Joined<["-"], "fmacro-prefix-map=">, Group, Flags<[CC1Option]>, + HelpText<"remap file source paths in predefined preprocessor macros">; def fforce_dwarf_frame : Flag<["-"], "fforce-dwarf-frame">, Group, Flags<[CC1Option]>, HelpText<"Always emit a debug frame section">; def fno_force_dwarf_frame : Flag<["-"], "fno-force-dwarf-frame">, Group, Flags<[CC1Option]>, @@ -2322,9 +2332,8 @@ def mrestrict_it: Flag<["-"], "mrestrict-it">, Group, def mno_restrict_it: Flag<["-"], "mno-restrict-it">, Group, HelpText<"Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode">; def marm : Flag<["-"], "marm">, Alias; -foreach i = {6-11} in - def ffixed_r#i : Flag<["-"], "ffixed-r"#i>, Group, - HelpText<"Reserve the r"#i#" register (ARM only)">; +def ffixed_r9 : Flag<["-"], "ffixed-r9">, Group, + HelpText<"Reserve the r9 register (ARM only)">; def mno_movt : Flag<["-"], "mno-movt">, Group, HelpText<"Disallow use of movt/movw pairs (ARM only)">; def mcrc : Flag<["-"], "mcrc">, Group, @@ -3298,7 +3307,6 @@ defm profile_values : BooleanFFlag<"profile-values">, Group, Group; defm rename_registers : BooleanFFlag<"rename-registers">, Group; defm ripa : BooleanFFlag<"ripa">, Group; -defm rounding_math : BooleanFFlag<"rounding-math">, Group; defm schedule_insns : BooleanFFlag<"schedule-insns">, Group; defm schedule_insns2 : BooleanFFlag<"schedule-insns2">, Group; defm see : BooleanFFlag<"see">, Group; diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 604a935f3a288..f17a10c7f5c80 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -1953,6 +1953,15 @@ struct FormatStyle { /// \endcode bool SpacesInAngles; + /// If ``true``, spaces will be inserted around if/for/switch/while + /// conditions. + /// \code + /// true: false: + /// if ( a ) { ... } vs. if (a) { ... } + /// while ( i < 5 ) { ... } while (i < 5) { ... } + /// \endcode + bool SpacesInConditionalStatement; + /// If ``true``, spaces are inserted inside container literals (e.g. /// ObjC and Javascript array and dict literals). 
/// \code{.js} @@ -2155,6 +2164,7 @@ struct FormatStyle { SpaceInEmptyParentheses == R.SpaceInEmptyParentheses && SpacesBeforeTrailingComments == R.SpacesBeforeTrailingComments && SpacesInAngles == R.SpacesInAngles && + SpacesInConditionalStatement == R.SpacesInConditionalStatement && SpacesInContainerLiterals == R.SpacesInContainerLiterals && SpacesInCStyleCastParentheses == R.SpacesInCStyleCastParentheses && SpacesInParentheses == R.SpacesInParentheses && diff --git a/clang/include/clang/Frontend/PrecompiledPreamble.h b/clang/include/clang/Frontend/PrecompiledPreamble.h index 1a8a64951ec49..5ae77735576cd 100644 --- a/clang/include/clang/Frontend/PrecompiledPreamble.h +++ b/clang/include/clang/Frontend/PrecompiledPreamble.h @@ -134,14 +134,6 @@ class PrecompiledPreamble { // A main method used to construct TempPCHFile. static llvm::ErrorOr<TempPCHFile> CreateNewPreamblePCHFile(); - /// Call llvm::sys::fs::createTemporaryFile to create a new temporary file. - static llvm::ErrorOr<TempPCHFile> createInSystemTempDir(const Twine &Prefix, StringRef Suffix); - /// Create a new instance of TemporaryFile for file at \p Path. Use with - /// extreme caution, there's an assertion checking that there's only a - /// single instance of TempPCHFile alive for each path. - static llvm::ErrorOr<TempPCHFile> createFromCustomPath(const Twine &Path); - private: TempPCHFile(std::string FilePath); diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 09b69f85f35f9..e50e38c4844ea 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -218,14 +218,18 @@ createChainedIncludesSource(CompilerInstance &CI, /// non-null (and possibly incorrect) CompilerInvocation if any errors were /// encountered. When this flag is false, always return null on errors. /// -/// \return A CompilerInvocation, or 0 if none was built for the given +/// \param CC1Args - if non-null, will be populated with the args to cc1 +/// expanded from \p Args. May be set even if nullptr is returned. +/// +/// \return A CompilerInvocation, or nullptr if none was built for the given /// argument vector. std::unique_ptr<CompilerInvocation> createInvocationFromCommandLine( ArrayRef<const char *> Args, IntrusiveRefCntPtr<DiagnosticsEngine> Diags = IntrusiveRefCntPtr<DiagnosticsEngine>(), IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr, - bool ShouldRecoverOnErrors = false); + bool ShouldRecoverOnErrors = false, + std::vector<std::string> *CC1Args = nullptr); /// Return the value of the last argument as an integer, or a default. If Diags /// is non-null, emits an error if the argument is given, but non-integral. diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index e2ddc80d503f1..9716196b95c23 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -932,6 +932,12 @@ class Preprocessor { return TheModuleLoader.HadFatalFailure; } + /// Retrieve the number of Directives that have been processed by the + /// Preprocessor. 
+ unsigned getNumDirectives() const { + return NumDirectives; + } + /// True if we are currently preprocessing a #if or #elif directive bool isParsingIfOrElifDirective() const { return ParsingIfOrElifDirective; diff --git a/clang/include/clang/Lex/PreprocessorOptions.h b/clang/include/clang/Lex/PreprocessorOptions.h index 344afa8941723..abffbd03c3b48 100644 --- a/clang/include/clang/Lex/PreprocessorOptions.h +++ b/clang/include/clang/Lex/PreprocessorOptions.h @@ -13,6 +13,8 @@ #include "clang/Lex/PreprocessorExcludedConditionalDirectiveSkipMapping.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" +#include <functional> +#include <map> #include #include #include @@ -173,6 +175,9 @@ class PreprocessorOptions { /// build it again. std::shared_ptr<FailedModulesSet> FailedModules; + /// A prefix map for __FILE__ and __BASE_FILE__. + std::map<std::string, std::string, std::greater<std::string>> MacroPrefixMap; + /// Contains the currently active skipped range mappings for skipping excluded /// conditional directives. /// diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 74518278c1c7b..d271a65f48b41 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1122,6 +1122,11 @@ class Parser : public CodeCompletionHandler { /// point for skipping past a simple-declaration. void SkipMalformedDecl(); + /// The location of the first statement inside an else that might + /// have a misleading indentation. If there is no + /// MisleadingIndentationChecker active on an else, this location is invalid. + SourceLocation MisleadingIndentationElseLoc; + private: //===--------------------------------------------------------------------===// // Lexing and parsing of C++ inline methods. diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 808113e1b9657..bab94c01117b3 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4035,6 +4035,9 @@ class Sema final { /// Add the given method to the list of globally-known methods. void addMethodToGlobalList(ObjCMethodList *List, ObjCMethodDecl *Method); + /// Returns default addr space for method qualifiers. + LangAS getDefaultCXXMethodAddrSpace() const; + private: /// AddMethodToGlobalPool - Add an instance or factory method to the global /// pool. See description of AddInstanceMethodToGlobalPool. 
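The MacroPrefixMap added to PreprocessorOptions above is deliberately keyed with std::greater, so iteration visits longer (more specific) prefixes before shorter ones. A minimal sketch of the kind of __FILE__ remapping this enables; the helper itself is hypothetical and not part of the patch:

#include <functional>
#include <map>
#include <string>

// Hypothetical helper: rewrite Path using the first matching map entry.
// Because the map is ordered by std::greater<std::string>, an entry for
// "/usr/src/project" is visited before one for "/usr/src", so the most
// specific mapping wins.
static std::string remapMacroPath(
    const std::string &Path,
    const std::map<std::string, std::string, std::greater<std::string>>
        &MacroPrefixMap) {
  for (const auto &Entry : MacroPrefixMap)
    if (Path.compare(0, Entry.first.size(), Entry.first) == 0)
      return Entry.second + Path.substr(Entry.first.size());
  return Path;
}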
@@ -4464,9 +4467,11 @@ typedef ProcessingContextState ParsingClassState; ParsingClassState PushParsingClass() { + ParsingClassDepth++; return DelayedDiagnostics.pushUndelayed(); } void PopParsingClass(ParsingClassState state) { + ParsingClassDepth--; DelayedDiagnostics.popUndelayed(state); } @@ -6519,7 +6524,7 @@ class Sema final { SourceLocation RBrac, const ParsedAttributesView &AttrList); void ActOnFinishCXXMemberDecls(); - void ActOnFinishCXXNonNestedClass(Decl *D); + void ActOnFinishCXXNonNestedClass(); void ActOnReenterCXXMethodParameter(Scope *S, ParmVarDecl *Param); unsigned ActOnReenterTemplateScope(Scope *S, Decl *Template); @@ -8897,6 +8902,8 @@ class Sema final { bool CheckARCMethodDecl(ObjCMethodDecl *method); bool inferObjCARCLifetime(ValueDecl *decl); + void deduceOpenCLAddressSpace(ValueDecl *decl); + ExprResult HandleExprPropertyRefExpr(const ObjCObjectPointerType *OPT, Expr *BaseExpr, @@ -11428,6 +11435,8 @@ class Sema final { bool CheckHexagonBuiltinCpu(unsigned BuiltinID, CallExpr *TheCall); bool CheckHexagonBuiltinArgument(unsigned BuiltinID, CallExpr *TheCall); bool CheckMipsBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); + bool CheckMipsBuiltinCpu(unsigned BuiltinID, CallExpr *TheCall); + bool CheckMipsBuiltinArgument(unsigned BuiltinID, CallExpr *TheCall); bool CheckSystemZBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall); bool CheckX86BuiltinRoundingOrSAE(unsigned BuiltinID, CallExpr *TheCall); bool CheckX86BuiltinGatherScatterScale(unsigned BuiltinID, CallExpr *TheCall); @@ -11690,6 +11699,8 @@ class Sema final { SmallVector<CXXMethodDecl*, 4> DelayedDllExportMemberFunctions; private: + int ParsingClassDepth = 0; + class SavePendingParsedClassStateRAII { public: SavePendingParsedClassStateRAII(Sema &S) : S(S) { swapSavedState(); } @@ -11699,8 +11710,6 @@ class Sema final { "there shouldn't be any pending delayed exception spec checks"); assert(S.DelayedEquivalentExceptionSpecChecks.empty() && "there shouldn't be any pending delayed exception spec checks"); - assert(S.DelayedDllExportClasses.empty() && - "there shouldn't be any pending delayed DLL export classes"); swapSavedState(); } @@ -11710,14 +11719,12 @@ class Sema final { SavedOverridingExceptionSpecChecks; decltype(DelayedEquivalentExceptionSpecChecks) SavedEquivalentExceptionSpecChecks; - decltype(DelayedDllExportClasses) SavedDllExportClasses; void swapSavedState() { SavedOverridingExceptionSpecChecks.swap( S.DelayedOverridingExceptionSpecChecks); SavedEquivalentExceptionSpecChecks.swap( S.DelayedEquivalentExceptionSpecChecks); - SavedDllExportClasses.swap(S.DelayedDllExportClasses); } }; diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index f0b5e99338232..b6dae68b3413b 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -551,6 +551,14 @@ class ASTReader llvm::DenseMap> AnonymousDeclarationsForMerging; + /// Key used to identify LifetimeExtendedTemporaryDecl for merging, + /// containing the lifetime-extending declaration and the mangling number. + using LETemporaryKey = std::pair<ValueDecl *, unsigned>; + + /// Map of already deserialized temporaries. 
+ llvm::DenseMap<LETemporaryKey, LifetimeExtendedTemporaryDecl *> + LETemporaryForMerging; + struct FileDeclsInfo { ModuleFile *Mod = nullptr; ArrayRef<serialization::LocalDeclID> Decls; diff --git a/clang/include/clang/Tooling/CompilationDatabase.h b/clang/include/clang/Tooling/CompilationDatabase.h index dea046a2dc7c3..b28a8a6d6e51c 100644 --- a/clang/include/clang/Tooling/CompilationDatabase.h +++ b/clang/include/clang/Tooling/CompilationDatabase.h @@ -31,6 +31,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/VirtualFileSystem.h" #include #include #include @@ -219,6 +220,12 @@ std::unique_ptr<CompilationDatabase> std::unique_ptr<CompilationDatabase> inferTargetAndDriverMode(std::unique_ptr<CompilationDatabase> Base); +/// Returns a wrapped CompilationDatabase that will expand all rsp (response) +/// files on the command line returned by the underlying database. +std::unique_ptr<CompilationDatabase> +expandResponseFiles(std::unique_ptr<CompilationDatabase> Base, + llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS); + } // namespace tooling } // namespace clang diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h index c40b6bd24817f..c4db4da892c2d 100644 --- a/clang/include/clang/Tooling/Syntax/Nodes.h +++ b/clang/include/clang/Tooling/Syntax/Nodes.h @@ -37,7 +37,6 @@ namespace syntax { enum class NodeKind : uint16_t { Leaf, TranslationUnit, - TopLevelDeclaration, // Expressions UnknownExpression, @@ -57,7 +56,11 @@ enum class NodeKind : uint16_t { ReturnStatement, RangeBasedForStatement, ExpressionStatement, - CompoundStatement + CompoundStatement, + + // Declarations + UnknownDeclaration, + SimpleDeclaration, }; /// For debugging purposes. llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, NodeKind K); @@ -102,20 +105,6 @@ class TranslationUnit final : public Tree { } }; -/// FIXME: this node is temporary and will be replaced with nodes for various -/// 'declarations' and 'declarators' from the C/C++ grammar -/// -/// Represents any top-level declaration. Only there to give the syntax tree a -/// bit of structure until we implement syntax nodes for declarations and -/// declarators. -class TopLevelDeclaration final : public Tree { -public: - TopLevelDeclaration() : Tree(NodeKind::TopLevelDeclaration) {} - static bool classof(const Node *N) { - return N->kind() == NodeKind::TopLevelDeclaration; - } -}; - /// A base class for all expressions. Note that expressions are not statements, /// even though they are in clang. class Expression : public Tree { @@ -313,6 +302,38 @@ class CompoundStatement final : public Statement { syntax::Leaf *rbrace(); }; +/// A declaration that can appear at the top-level. Note that this does *not* +/// correspond 1-to-1 to clang::Decl. Syntax trees distinguish between top-level +/// declarations (e.g. namespace definitions) and declarators (e.g. variables, +/// typedefs, etc.). Declarators are stored inside SimpleDeclaration. +class Declaration : public Tree { +public: + Declaration(NodeKind K) : Tree(K) {} + static bool classof(const Node *N) { + return NodeKind::UnknownDeclaration <= N->kind() && + N->kind() <= NodeKind::SimpleDeclaration; + } +}; + +/// Declaration of an unknown kind, e.g. not yet supported in syntax trees. +class UnknownDeclaration final : public Declaration { +public: + UnknownDeclaration() : Declaration(NodeKind::UnknownDeclaration) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::UnknownDeclaration; + } +}; + +/// Groups multiple declarators (e.g. variables, typedefs, etc.) together. All +/// grouped declarators share the same declaration specifiers (e.g. 'int' or +/// 'typedef'). 
+class SimpleDeclaration final : public Declaration { +public: + SimpleDeclaration() : Declaration(NodeKind::SimpleDeclaration) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::SimpleDeclaration; + } +}; } // namespace syntax } // namespace clang #endif diff --git a/clang/include/clang/Tooling/Syntax/Tokens.h b/clang/include/clang/Tooling/Syntax/Tokens.h index 301432d3888b3..6f4d0e0c050af 100644 --- a/clang/include/clang/Tooling/Syntax/Tokens.h +++ b/clang/include/clang/Tooling/Syntax/Tokens.h @@ -175,6 +175,7 @@ class TokenBuffer { /// All tokens produced by the preprocessor after all macro replacements, /// directives, etc. Source locations found in the clang AST will always /// point to one of these tokens. + /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split /// into two '>' tokens by the parser. However, TokenBuffer currently /// keeps it as a single '>>' token. @@ -182,6 +183,10 @@ class TokenBuffer { return ExpandedTokens; } + /// Returns the subrange of expandedTokens() corresponding to the closed + /// token range R. + llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; + /// Find the subrange of spelled tokens that produced the corresponding \p /// Expanded tokens. /// diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index bd29a6991afe5..2ed523b741b15 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -771,6 +771,7 @@ CXXABI *ASTContext::createCXXABI(const TargetInfo &T) { if (!LangOpts.CPlusPlus) return nullptr; switch (T.getCXXABI().getKind()) { + case TargetCXXABI::Fuchsia: case TargetCXXABI::GenericARM: // Same as Itanium at this level case TargetCXXABI::iOS: case TargetCXXABI::iOS64: @@ -3880,10 +3881,11 @@ QualType ASTContext::getFunctionTypeInternal( auto ESH = FunctionProtoType::getExceptionSpecSize( EPI.ExceptionSpec.Type, EPI.ExceptionSpec.Exceptions.size()); size_t Size = FunctionProtoType::totalSizeToAlloc< - QualType, FunctionType::FunctionTypeExtraBitfields, + QualType, SourceLocation, FunctionType::FunctionTypeExtraBitfields, FunctionType::ExceptionType, Expr *, FunctionDecl *, FunctionProtoType::ExtParameterInfo, Qualifiers>( - NumArgs, FunctionProtoType::hasExtraBitfields(EPI.ExceptionSpec.Type), + NumArgs, EPI.Variadic, + FunctionProtoType::hasExtraBitfields(EPI.ExceptionSpec.Type), ESH.NumExceptionType, ESH.NumExprPtr, ESH.NumFunctionDeclPtr, EPI.ExtParameterInfos ? NumArgs : 0, EPI.TypeQuals.hasNonFastQualifiers() ? 
1 : 0); @@ -10177,6 +10179,7 @@ MangleContext *ASTContext::createMangleContext(const TargetInfo *T) { if (!T) T = Target; switch (T->getCXXABI().getKind()) { + case TargetCXXABI::Fuchsia: case TargetCXXABI::GenericAArch64: case TargetCXXABI::GenericItanium: case TargetCXXABI::GenericARM: diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 3723c868004fe..0301110b7067f 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2793,6 +2793,7 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, FunctionDeclBits.ConstexprKind = ConstexprKind; FunctionDeclBits.InstantiationIsPending = false; FunctionDeclBits.UsesSEHTry = false; + FunctionDeclBits.UsesFPIntrin = false; FunctionDeclBits.HasSkippedBody = false; FunctionDeclBits.WillHaveBody = false; FunctionDeclBits.IsMultiVersion = false; @@ -3356,6 +3357,22 @@ SourceRange FunctionDecl::getReturnTypeSourceRange() const { return RTRange; } +SourceRange FunctionDecl::getParametersSourceRange() const { + unsigned NP = getNumParams(); + SourceLocation EllipsisLoc = getEllipsisLoc(); + + if (NP == 0 && EllipsisLoc.isInvalid()) + return SourceRange(); + + SourceLocation Begin = + NP > 0 ? ParamInfo[0]->getSourceRange().getBegin() : EllipsisLoc; + SourceLocation End = EllipsisLoc.isValid() + ? EllipsisLoc + : ParamInfo[NP - 1]->getSourceRange().getEnd(); + + return SourceRange(Begin, End); +} + SourceRange FunctionDecl::getExceptionSpecSourceRange() const { FunctionTypeLoc FTL = getFunctionTypeLoc(); return FTL ? FTL.getExceptionSpecRange() : SourceRange(); diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index c4fd5cd1c3c65..03a6d8c9bcff2 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -1754,6 +1754,15 @@ MemberExpr *MemberExpr::Create( MemberExpr *E = new (Mem) MemberExpr(Base, IsArrow, OperatorLoc, MemberDecl, NameInfo, T, VK, OK, NOUR); + if (isa<FieldDecl>(MemberDecl)) { + DeclContext *DC = MemberDecl->getDeclContext(); + // dyn_cast_or_null is used to handle objC variables which do not + // have a declaration context. + CXXRecordDecl *RD = dyn_cast_or_null<CXXRecordDecl>(DC); + if (RD && RD->isDependentContext() && RD->isCurrentInstantiation(DC)) + E->setTypeDependent(T->isDependentType()); + } + if (HasQualOrFound) { // FIXME: Wrong. We should be looking at the member declaration we found. if (QualifierLoc && QualifierLoc.getNestedNameSpecifier()->isDependent()) { @@ -1890,7 +1899,7 @@ bool CastExpr::CastConsistency() const { auto Ty = getType(); auto SETy = getSubExpr()->getType(); assert(getValueKindForType(Ty) == Expr::getValueKindForType(SETy)); - if (/*isRValue()*/ !Ty->getPointeeType().isNull()) { + if (isRValue()) { Ty = Ty->getPointeeType(); SETy = SETy->getPointeeType(); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 79659261388b4..7a17b76f05d3d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -763,11 +763,8 @@ namespace { /// we will evaluate. unsigned StepsLeft; - /// Force the use of the experimental new constant interpreter, bailing out - /// with an error if a feature is not supported. - bool ForceNewConstInterp; - - /// Enable the experimental new constant interpreter. + /// Enable the experimental new constant interpreter. If an expression is + /// not supported by the interpreter, an error is triggered. bool EnableNewConstInterp; /// BottomFrame - The frame in which evaluation started. 
This must be @@ -921,10 +918,8 @@ namespace { EvalInfo(const ASTContext &C, Expr::EvalStatus &S, EvaluationMode Mode) : Ctx(const_cast<ASTContext &>(C)), EvalStatus(S), CurrentCall(nullptr), CallStackDepth(0), NextCallIndex(1), - StepsLeft(getLangOpts().ConstexprStepLimit), - ForceNewConstInterp(getLangOpts().ForceNewConstInterp), - EnableNewConstInterp(ForceNewConstInterp || - getLangOpts().EnableNewConstInterp), + StepsLeft(C.getLangOpts().ConstexprStepLimit), + EnableNewConstInterp(C.getLangOpts().EnableNewConstInterp), BottomFrame(*this, SourceLocation(), nullptr, nullptr, nullptr), EvaluatingDecl((const ValueDecl *)nullptr), EvaluatingDeclValue(nullptr), HasActiveDiagnostic(false), @@ -7866,6 +7861,11 @@ class PointerExprEvaluator // either copied into the closure object's field that represents the '*this' // or refers to '*this'. if (isLambdaCallOperator(Info.CurrentCall->Callee)) { + // Ensure we actually have captured 'this'. (an error will have + // been previously reported if not). + if (!Info.CurrentCall->LambdaThisCaptureField) + return false; + // Update 'Result' to refer to the data member/field of the closure object // that represents the '*this' capture. if (!HandleLValueMember(Info, E, Result, @@ -13400,32 +13400,25 @@ static bool EvaluateInPlace(APValue &Result, EvalInfo &Info, const LValue &This, /// EvaluateAsRValue - Try to evaluate this expression, performing an implicit /// lvalue-to-rvalue cast if it is an lvalue. static bool EvaluateAsRValue(EvalInfo &Info, const Expr *E, APValue &Result) { - if (Info.EnableNewConstInterp) { - auto &InterpCtx = Info.Ctx.getInterpContext(); - switch (InterpCtx.evaluateAsRValue(Info, E, Result)) { - case interp::InterpResult::Success: - return true; - case interp::InterpResult::Fail: + if (Info.EnableNewConstInterp) { + if (!Info.Ctx.getInterpContext().evaluateAsRValue(Info, E, Result)) + return false; + } else { + if (E->getType().isNull()) return false; - case interp::InterpResult::Bail: - break; - } - } - - if (E->getType().isNull()) - return false; - - if (!CheckLiteralType(Info, E)) - return false; - if (!::Evaluate(Result, Info, E)) - return false; + if (!CheckLiteralType(Info, E)) + return false; - if (E->isGLValue()) { - LValue LV; - LV.setFrom(Info.Ctx, Result); - if (!handleLValueToRValueConversion(Info, E, E->getType(), LV, Result)) + if (!::Evaluate(Result, Info, E)) return false; + + if (E->isGLValue()) { + LValue LV; + LV.setFrom(Info.Ctx, Result); + if (!handleLValueToRValueConversion(Info, E, E->getType(), LV, Result)) + return false; + } } // Check this core constant expression is a constant expression. @@ -13637,46 +13630,36 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, if (Info.EnableNewConstInterp) { auto &InterpCtx = const_cast<ASTContext &>(Ctx).getInterpContext(); - switch (InterpCtx.evaluateAsInitializer(Info, VD, Value)) { - case interp::InterpResult::Fail: - // Bail out if an error was encountered. - return false; - case interp::InterpResult::Success: - // Evaluation succeeded and value was set. - return CheckConstantExpression(Info, DeclLoc, DeclTy, Value); - case interp::InterpResult::Bail: - // Evaluate the value again for the tree evaluator to use. - break; + if (!InterpCtx.evaluateAsInitializer(Info, VD, Value)) + return false; + } else { + LValue LVal; + LVal.set(VD); + + // C++11 [basic.start.init]p2: + // Variables with static storage duration or thread storage duration shall + // be zero-initialized before any other initialization takes place. + // This behavior is not present in C. 
+ if (Ctx.getLangOpts().CPlusPlus && !VD->hasLocalStorage() && + !DeclTy->isReferenceType()) { + ImplicitValueInitExpr VIE(DeclTy); + if (!EvaluateInPlace(Value, Info, LVal, &VIE, + /*AllowNonLiteralTypes=*/true)) + return false; } - } - - LValue LVal; - LVal.set(VD); - // C++11 [basic.start.init]p2: - // Variables with static storage duration or thread storage duration shall be - // zero-initialized before any other initialization takes place. - // This behavior is not present in C. - if (Ctx.getLangOpts().CPlusPlus && !VD->hasLocalStorage() && - !DeclTy->isReferenceType()) { - ImplicitValueInitExpr VIE(DeclTy); - if (!EvaluateInPlace(Value, Info, LVal, &VIE, - /*AllowNonLiteralTypes=*/true)) + if (!EvaluateInPlace(Value, Info, LVal, this, + /*AllowNonLiteralTypes=*/true) || + EStatus.HasSideEffects) return false; - } - - if (!EvaluateInPlace(Value, Info, LVal, this, - /*AllowNonLiteralTypes=*/true) || - EStatus.HasSideEffects) - return false; - - // At this point, any lifetime-extended temporaries are completely - // initialized. - Info.performLifetimeExtension(); - if (!Info.discardCleanups()) - llvm_unreachable("Unhandled cleanup; missing full expression marker?"); + // At this point, any lifetime-extended temporaries are completely + // initialized. + Info.performLifetimeExtension(); + if (!Info.discardCleanups()) + llvm_unreachable("Unhandled cleanup; missing full expression marker?"); + } return CheckConstantExpression(Info, DeclLoc, DeclTy, Value) && CheckMemoryLeaks(Info); } @@ -14415,14 +14398,8 @@ bool Expr::isPotentialConstantExpr(const FunctionDecl *FD, // The constexpr VM attempts to compile all methods to bytecode here. if (Info.EnableNewConstInterp) { - auto &InterpCtx = Info.Ctx.getInterpContext(); - switch (InterpCtx.isPotentialConstantExpr(Info, FD)) { - case interp::InterpResult::Success: - case interp::InterpResult::Fail: - return Diags.empty(); - case interp::InterpResult::Bail: - break; - } + Info.Ctx.getInterpContext().isPotentialConstantExpr(Info, FD); + return Diags.empty(); } const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD); diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp index 4f8f7b96e7c32..e7f9ba0f010ae 100644 --- a/clang/lib/AST/Interp/Context.cpp +++ b/clang/lib/AST/Interp/Context.cpp @@ -21,44 +21,37 @@ using namespace clang; using namespace clang::interp; -Context::Context(ASTContext &Ctx) - : Ctx(Ctx), ForceInterp(getLangOpts().ForceNewConstInterp), - P(new Program(*this)) {} +Context::Context(ASTContext &Ctx) : Ctx(Ctx), P(new Program(*this)) {} Context::~Context() {} -InterpResult Context::isPotentialConstantExpr(State &Parent, - const FunctionDecl *FD) { +bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { Function *Func = P->getFunction(FD); if (!Func) { if (auto R = ByteCodeStmtGen<ByteCodeEmitter>(*this, *P).compileFunc(FD)) { Func = *R; - } else if (ForceInterp) { + } else { handleAllErrors(R.takeError(), [&Parent](ByteCodeGenError &Err) { Parent.FFDiag(Err.getLoc(), diag::err_experimental_clang_interp_failed); }); - return InterpResult::Fail; - } else { - consumeError(R.takeError()); - return InterpResult::Bail; + return false; } } if (!Func->isConstexpr()) - return InterpResult::Fail; + return false; APValue Dummy; return Run(Parent, Func, Dummy); } -InterpResult Context::evaluateAsRValue(State &Parent, const Expr *E, - APValue &Result) { +bool Context::evaluateAsRValue(State &Parent, const Expr *E, APValue &Result) { ByteCodeExprGen<EvalEmitter> C(*this, *P, Parent, Stk, Result); return Check(Parent, C.interpretExpr(E)); } 
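The Interp/Context changes above collapse the old three-state InterpResult into a plain bool, with llvm::Expected<bool> carrying bytecode-generation failures into Check. For readers unfamiliar with the idiom, a small self-contained sketch of the unwrap-or-report pattern the new Check follows (the function name is hypothetical):

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical stand-in for Context::Check: on success, unwrap the inner
// bool; on failure, consume the Error exactly once and map it to false.
static bool checkResult(llvm::Expected<bool> Flag) {
  if (Flag)
    return *Flag; // bytecode was generated; *Flag says whether it ran OK
  llvm::handleAllErrors(Flag.takeError(),
                        [](const llvm::ErrorInfoBase &EIB) {
                          llvm::errs() << EIB.message() << "\n";
                        });
  return false;
}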
-InterpResult Context::evaluateAsInitializer(State &Parent, const VarDecl *VD, - APValue &Result) { +bool Context::evaluateAsInitializer(State &Parent, const VarDecl *VD, + APValue &Result) { ByteCodeExprGen<EvalEmitter> C(*this, *P, Parent, Stk, Result); return Check(Parent, C.interpretDecl(VD)); } @@ -116,33 +109,20 @@ unsigned Context::getCharBit() const { return Ctx.getTargetInfo().getCharWidth(); } -InterpResult Context::Run(State &Parent, Function *Func, APValue &Result) { - InterpResult Flag; - { - InterpState State(Parent, *P, Stk, *this); - State.Current = new InterpFrame(State, Func, nullptr, {}, {}); - if (Interpret(State, Result)) { - Flag = InterpResult::Success; - } else { - Flag = InterpResult::Fail; - } - } - - if (Flag != InterpResult::Success) - Stk.clear(); - return Flag; +bool Context::Run(State &Parent, Function *Func, APValue &Result) { + InterpState State(Parent, *P, Stk, *this); + State.Current = new InterpFrame(State, Func, nullptr, {}, {}); + if (Interpret(State, Result)) + return true; + Stk.clear(); + return false; } -InterpResult Context::Check(State &Parent, llvm::Expected<bool> &&R) { - if (R) { - return *R ? InterpResult::Success : InterpResult::Fail; - } else if (ForceInterp) { - handleAllErrors(R.takeError(), [&Parent](ByteCodeGenError &Err) { - Parent.FFDiag(Err.getLoc(), diag::err_experimental_clang_interp_failed); - }); - return InterpResult::Fail; - } else { - consumeError(R.takeError()); - return InterpResult::Bail; - } +bool Context::Check(State &Parent, llvm::Expected<bool> &&Flag) { + if (Flag) + return *Flag; + handleAllErrors(Flag.takeError(), [&Parent](ByteCodeGenError &Err) { + Parent.FFDiag(Err.getLoc(), diag::err_experimental_clang_interp_failed); + }); + return false; } diff --git a/clang/lib/AST/Interp/Context.h b/clang/lib/AST/Interp/Context.h index 96368b6e5f02f..e4d831cbb9912 100644 --- a/clang/lib/AST/Interp/Context.h +++ b/clang/lib/AST/Interp/Context.h @@ -34,16 +34,6 @@ class Program; class State; enum PrimType : unsigned; -/// Wrapper around interpreter termination results. -enum class InterpResult { - /// Interpreter successfully computed a value. - Success, - /// Interpreter encountered an error and quit. - Fail, - /// Interpreter encountered an unimplemented feature, AST fallback. - Bail, -}; - /// Holds all information required to evaluate constexpr code in a module. class Context { public: @@ -54,15 +44,13 @@ class Context { ~Context(); /// Checks if a function is a potential constant expression. - InterpResult isPotentialConstantExpr(State &Parent, - const FunctionDecl *FnDecl); + bool isPotentialConstantExpr(State &Parent, const FunctionDecl *FnDecl); /// Evaluates a toplevel expression as an rvalue. - InterpResult evaluateAsRValue(State &Parent, const Expr *E, APValue &Result); + bool evaluateAsRValue(State &Parent, const Expr *E, APValue &Result); /// Evaluates a toplevel initializer. - InterpResult evaluateAsInitializer(State &Parent, const VarDecl *VD, - APValue &Result); + bool evaluateAsInitializer(State &Parent, const VarDecl *VD, APValue &Result); /// Returns the AST context. ASTContext &getASTContext() const { return Ctx; } @@ -78,16 +66,14 @@ class Context { private: /// Runs a function. - InterpResult Run(State &Parent, Function *Func, APValue &Result); + bool Run(State &Parent, Function *Func, APValue &Result); /// Checks a result from the interpreter. - InterpResult Check(State &Parent, llvm::Expected<bool> &&R); + bool Check(State &Parent, llvm::Expected<bool> &&R); private: /// Current compilation context. 
ASTContext &Ctx; - /// Flag to indicate if the use of the interpreter is mandatory. - bool ForceInterp; /// Interpreter stack, shared across invocations. InterpStack Stk; /// Constexpr program. diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index 274cc25b8bb8d..40c6c8375a606 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -202,14 +202,20 @@ void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc, PresumedLoc Presumed = SM.getPresumedLoc(Loc); unsigned ActualLine = IsSpelling ? SM.getSpellingLineNumber(Loc) : SM.getExpansionLineNumber(Loc); + StringRef ActualFile = SM.getBufferName(Loc); + if (Presumed.isValid()) { JOS.attribute("offset", SM.getDecomposedLoc(Loc).second); - if (LastLocFilename != Presumed.getFilename()) { - JOS.attribute("file", Presumed.getFilename()); + if (LastLocFilename != ActualFile) { + JOS.attribute("file", ActualFile); JOS.attribute("line", ActualLine); } else if (LastLocLine != ActualLine) JOS.attribute("line", ActualLine); + StringRef PresumedFile = Presumed.getFilename(); + if (PresumedFile != ActualFile && LastLocPresumedFilename != PresumedFile) + JOS.attribute("presumedFile", PresumedFile); + unsigned PresumedLine = Presumed.getLine(); if (ActualLine != PresumedLine && LastLocPresumedLine != PresumedLine) JOS.attribute("presumedLine", PresumedLine); @@ -217,7 +223,8 @@ void JSONNodeDumper::writeBareSourceLocation(SourceLocation Loc, JOS.attribute("col", Presumed.getColumn()); JOS.attribute("tokLen", Lexer::MeasureTokenLength(Loc, SM, Ctx.getLangOpts())); - LastLocFilename = Presumed.getFilename(); + LastLocFilename = ActualFile; + LastLocPresumedFilename = PresumedFile; LastLocPresumedLine = PresumedLine; LastLocLine = ActualLine; diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 0ff95213118fd..561c76a45cbc2 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1338,6 +1338,17 @@ void TextNodeDumper::VisitFunctionDecl(const FunctionDecl *D) { OS << " <<<NULL params x " << D->getNumParams() << ">>>"; } +void TextNodeDumper::VisitLifetimeExtendedTemporaryDecl( + const LifetimeExtendedTemporaryDecl *D) { + OS << " extended by "; + dumpBareDeclRef(D->getExtendingDecl()); + OS << " mangling "; + { + ColorScope Color(OS, ShowColors, ValueColor); + OS << D->getManglingNumber(); + } +} + void TextNodeDumper::VisitFieldDecl(const FieldDecl *D) { dumpName(D); dumpType(D->getType()); diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 4fed5b410b172..2eae2ebb61741 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -3065,6 +3065,12 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef<QualType> params, } else { FunctionTypeBits.HasExtQuals = 0; } + + // Fill in the Ellipsis location info if present. + if (epi.Variadic) { + auto &EllipsisLoc = *getTrailingObjects<SourceLocation>(); + EllipsisLoc = epi.EllipsisLoc; + } } bool FunctionProtoType::hasDependentExceptionSpec() const { diff --git a/clang/lib/Analysis/BodyFarm.cpp b/clang/lib/Analysis/BodyFarm.cpp index 694913b3ac937..1a7891550542d 100644 --- a/clang/lib/Analysis/BodyFarm.cpp +++ b/clang/lib/Analysis/BodyFarm.cpp @@ -741,13 +741,17 @@ static Stmt *createObjCPropertyGetter(ASTContext &Ctx, // First, find the backing ivar. const ObjCIvarDecl *IVar = nullptr; - // Property accessor stubs sometimes do not correspond to any property. + // Property accessor stubs sometimes do not correspond to any property decl + // in the current interface (but in a superclass). 
They still have a + // corresponding property impl decl in this case. if (MD->isSynthesizedAccessorStub()) { const ObjCInterfaceDecl *IntD = MD->getClassInterface(); const ObjCImplementationDecl *ImpD = IntD->getImplementation(); - for (const auto *V: ImpD->ivars()) { - if (V->getName() == MD->getSelector().getNameForSlot(0)) - IVar = V; + for (const auto *PI: ImpD->property_impls()) { + if (const ObjCPropertyDecl *P = PI->getPropertyDecl()) { + if (P->getGetterName() == MD->getSelector()) + IVar = P->getPropertyIvarDecl(); + } } } diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 5214f7c30ee0a..cba3e3ada7ea5 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -158,6 +158,7 @@ void AArch64TargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts, void AArch64TargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts, MacroBuilder &Builder) const { + Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1"); Builder.defineMacro("__ARM_FEATURE_JCVT", "1"); // Also include the Armv8.2 defines getTargetDefinesARMV82A(Opts, Builder); diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 92e5e26eba3c2..be088e81cffe4 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -580,6 +580,13 @@ void ARMTargetInfo::getTargetDefinesARMV82A(const LangOptions &Opts, getTargetDefinesARMV81A(Opts, Builder); } +void ARMTargetInfo::getTargetDefinesARMV83A(const LangOptions &Opts, + MacroBuilder &Builder) const { + // Also include the ARMv8.2-A defines + Builder.defineMacro("__ARM_FEATURE_COMPLEX", "1"); + getTargetDefinesARMV82A(Opts, Builder); +} + void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { // Target identification. @@ -809,6 +816,11 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, case llvm::ARM::ArchKind::ARMV8_2A: getTargetDefinesARMV82A(Opts, Builder); break; + case llvm::ARM::ArchKind::ARMV8_3A: + case llvm::ARM::ArchKind::ARMV8_4A: + case llvm::ARM::ArchKind::ARMV8_5A: + getTargetDefinesARMV83A(Opts, Builder); + break; } } @@ -879,38 +891,6 @@ ArrayRef ARMTargetInfo::getGCCRegAliases() const { return llvm::makeArrayRef(GCCRegAliases); } -bool ARMTargetInfo::validateGlobalRegisterVariable( - StringRef RegName, unsigned RegSize, bool &HasSizeMismatch) const { - bool isValid = llvm::StringSwitch(RegName) - .Case("r6", true) - .Case("r7", true) - .Case("r8", true) - .Case("r9", true) - .Case("r10", true) - .Case("r11", true) - .Case("sp", true) - .Default(false); - HasSizeMismatch = false; - return isValid; -} - -bool ARMTargetInfo::isRegisterReservedGlobally(StringRef RegName) const { - // The "sp" register does not have a -ffixed-sp option, - // so reserve it unconditionally. 
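With Armv8.3-A and later now routed through getTargetDefinesARMV83A on both AArch64 and 32-bit ARM, __ARM_FEATURE_COMPLEX becomes the ACLE feature gate for the complex-arithmetic intrinsics that the CGBuiltin.cpp tables below wire up to the vcadd family. A usage sketch; the intrinsic spelling here is the ACLE name and should be checked against your toolchain's arm_neon.h:

    #include <arm_neon.h>

    // Complex addition with the second operand rotated by 90 degrees,
    // available when the target advertises the v8.3-A complex extension.
    float32x2_t rotated_add(float32x2_t a, float32x2_t b) {
    #if defined(__ARM_FEATURE_COMPLEX)
      return vcadd_rot90_f32(a, b);
    #else
      return vadd_f32(a, b); // plain addition as a fallback
    #endif
    }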
- if (RegName.equals("sp")) - return true; - - // reserve rN (N:6-11) registers only if the corresponding - // +reserve-rN feature is found - const std::vector &Features = getTargetOpts().Features; - const std::string SearchFeature = "+reserve-" + RegName.str(); - for (const std::string &Feature : Features) { - if (Feature.compare(SearchFeature) == 0) - return true; - } - return false; -} - bool ARMTargetInfo::validateAsmConstraint( const char *&Name, TargetInfo::ConstraintInfo &Info) const { switch (*Name) { diff --git a/clang/lib/Basic/Targets/ARM.h b/clang/lib/Basic/Targets/ARM.h index 90fb20f8f7a5f..9696a44045891 100644 --- a/clang/lib/Basic/Targets/ARM.h +++ b/clang/lib/Basic/Targets/ARM.h @@ -148,9 +148,10 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo { void getTargetDefinesARMV81A(const LangOptions &Opts, MacroBuilder &Builder) const; - void getTargetDefinesARMV82A(const LangOptions &Opts, MacroBuilder &Builder) const; + void getTargetDefinesARMV83A(const LangOptions &Opts, + MacroBuilder &Builder) const; void getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const override; @@ -161,9 +162,6 @@ class LLVM_LIBRARY_VISIBILITY ARMTargetInfo : public TargetInfo { ArrayRef getGCCRegNames() const override; ArrayRef getGCCRegAliases() const override; - bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize, - bool &HasSizeMismatch) const override; - bool isRegisterReservedGlobally(StringRef RegName) const override; bool validateAsmConstraint(const char *&Name, TargetInfo::ConstraintInfo &Info) const override; std::string convertConstraint(const char *&Constraint) const override; diff --git a/clang/lib/Basic/Targets/Mips.cpp b/clang/lib/Basic/Targets/Mips.cpp index b9ab80df61940..ead5e91f7c8f2 100644 --- a/clang/lib/Basic/Targets/Mips.cpp +++ b/clang/lib/Basic/Targets/Mips.cpp @@ -213,7 +213,10 @@ void MipsTargetInfo::getTargetDefines(const LangOptions &Opts, bool MipsTargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch(Feature) .Case("mips", true) + .Case("dsp", DspRev >= DSP1) + .Case("dspr2", DspRev >= DSP2) .Case("fp64", FPMode == FP64) + .Case("msa", HasMSA) .Default(false); } diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index 72fdb0e7dde8a..d4ffffc64ba8d 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -180,7 +180,7 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) { if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) { if (Opts.CPlusPlus2a) - Builder.defineMacro("_MSVC_LANG", "201704L"); + Builder.defineMacro("_MSVC_LANG", "201705L"); else if (Opts.CPlusPlus17) Builder.defineMacro("_MSVC_LANG", "201703L"); else if (Opts.CPlusPlus14) diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index cc72a0a39f30f..756cb7a8bbe3c 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -808,6 +808,7 @@ class LLVM_LIBRARY_VISIBILITY FuchsiaTargetInfo : public OSTargetInfo { FuchsiaTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts) : OSTargetInfo(Triple, Opts) { this->MCountName = "__mcount"; + this->TheCXXABI.set(TargetCXXABI::Fuchsia); } }; diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp index 039fe6da84201..d07aaf58681c2 100644 --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -139,7 +139,7 @@ namespace { const LValue &getAtomicLValue() const { return 
LVal; } llvm::Value *getAtomicPointer() const { if (LVal.isSimple()) - return LVal.getPointer(); + return LVal.getPointer(CGF); else if (LVal.isBitField()) return LVal.getBitFieldPointer(); else if (LVal.isVectorElt()) @@ -343,7 +343,7 @@ bool AtomicInfo::requiresMemSetZero(llvm::Type *type) const { bool AtomicInfo::emitMemSetZeroIfNecessary() const { assert(LVal.isSimple()); - llvm::Value *addr = LVal.getPointer(); + llvm::Value *addr = LVal.getPointer(CGF); if (!requiresMemSetZero(addr->getType()->getPointerElementType())) return false; @@ -1628,7 +1628,7 @@ Address AtomicInfo::materializeRValue(RValue rvalue) const { LValue TempLV = CGF.MakeAddrLValue(CreateTempAlloca(), getAtomicType()); AtomicInfo Atomics(CGF, TempLV); Atomics.emitCopyIntoMemory(rvalue); - return TempLV.getAddress(); + return TempLV.getAddress(CGF); } llvm::Value *AtomicInfo::convertRValueToInt(RValue RVal) const { @@ -1975,8 +1975,8 @@ void CodeGenFunction::EmitAtomicStore(RValue rvalue, LValue dest, // If this is an aggregate r-value, it should agree in type except // maybe for address-space qualification. assert(!rvalue.isAggregate() || - rvalue.getAggregateAddress().getElementType() - == dest.getAddress().getElementType()); + rvalue.getAggregateAddress().getElementType() == + dest.getAddress(*this).getElementType()); AtomicInfo atomics(*this, dest); LValue LVal = atomics.getAtomicLValue(); @@ -2043,10 +2043,10 @@ std::pair CodeGenFunction::EmitAtomicCompareExchange( // maybe for address-space qualification. assert(!Expected.isAggregate() || Expected.getAggregateAddress().getElementType() == - Obj.getAddress().getElementType()); + Obj.getAddress(*this).getElementType()); assert(!Desired.isAggregate() || Desired.getAggregateAddress().getElementType() == - Obj.getAddress().getElementType()); + Obj.getAddress(*this).getElementType()); AtomicInfo Atomics(*this, Obj); return Atomics.EmitAtomicCompareExchange(Expected, Desired, Success, Failure, @@ -2086,13 +2086,11 @@ void CodeGenFunction::EmitAtomicInit(Expr *init, LValue dest) { } // Evaluate the expression directly into the destination. - AggValueSlot slot = AggValueSlot::forLValue(dest, - AggValueSlot::IsNotDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - AggValueSlot::DoesNotOverlap, - Zeroed ? AggValueSlot::IsZeroed : - AggValueSlot::IsNotZeroed); + AggValueSlot slot = AggValueSlot::forLValue( + dest, *this, AggValueSlot::IsNotDestructed, + AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased, + AggValueSlot::DoesNotOverlap, + Zeroed ? 
AggValueSlot::IsZeroed : AggValueSlot::IsNotZeroed); EmitAggExpr(init, slot); return; diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp index f90d9439af257..6a1a73955319c 100644 --- a/clang/lib/CodeGen/CGBlocks.cpp +++ b/clang/lib/CodeGen/CGBlocks.cpp @@ -1076,7 +1076,7 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) { /*RefersToEnclosingVariableOrCapture*/ CI.isNested(), type.getNonReferenceType(), VK_LValue, SourceLocation()); - src = EmitDeclRefLValue(&declRef).getAddress(); + src = EmitDeclRefLValue(&declRef).getAddress(*this); }; // For byrefs, we just write the pointer to the byref struct into diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 676ea85e89e02..9b5fe9530210f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3367,7 +3367,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(Carry); } case Builtin::BI__builtin_addressof: - return RValue::get(EmitLValue(E->getArg(0)).getPointer()); + return RValue::get(EmitLValue(E->getArg(0)).getPointer(*this)); case Builtin::BI__builtin_operator_new: return EmitBuiltinNewDeleteCall( E->getCallee()->getType()->castAs(), E, false); @@ -3750,8 +3750,8 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Value *Queue = EmitScalarExpr(E->getArg(0)); llvm::Value *Flags = EmitScalarExpr(E->getArg(1)); LValue NDRangeL = EmitAggExprToLValue(E->getArg(2)); - llvm::Value *Range = NDRangeL.getAddress().getPointer(); - llvm::Type *RangeTy = NDRangeL.getAddress().getType(); + llvm::Value *Range = NDRangeL.getAddress(*this).getPointer(); + llvm::Type *RangeTy = NDRangeL.getAddress(*this).getType(); if (NumArgs == 4) { // The most basic form of the call with parameters: @@ -3770,7 +3770,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Builder.CreatePointerCast(Info.BlockArg, GenericVoidPtrTy); AttrBuilder B; - B.addByValAttr(NDRangeL.getAddress().getElementType()); + B.addByValAttr(NDRangeL.getAddress(*this).getElementType()); llvm::AttributeList ByValAttrSet = llvm::AttributeList::get(CGM.getModule().getContext(), 3U, B); @@ -3955,7 +3955,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, llvm::Type *GenericVoidPtrTy = Builder.getInt8PtrTy( getContext().getTargetAddressSpace(LangAS::opencl_generic)); LValue NDRangeL = EmitAggExprToLValue(E->getArg(0)); - llvm::Value *NDRange = NDRangeL.getAddress().getPointer(); + llvm::Value *NDRange = NDRangeL.getAddress(*this).getPointer(); auto Info = CGM.getOpenCLRuntime().emitOpenCLEnqueuedBlock(*this, E->getArg(1)); Value *Kernel = Builder.CreatePointerCast(Info.Kernel, GenericVoidPtrTy); @@ -4458,6 +4458,10 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP1(vaesmcq_v, arm_neon_aesmc, 0), NEONMAP1(vbsl_v, arm_neon_vbsl, AddRetType), NEONMAP1(vbslq_v, arm_neon_vbsl, AddRetType), + NEONMAP1(vcadd_rot270_v, arm_neon_vcadd_rot270, Add1ArgType), + NEONMAP1(vcadd_rot90_v, arm_neon_vcadd_rot90, Add1ArgType), + NEONMAP1(vcaddq_rot270_v, arm_neon_vcadd_rot270, Add1ArgType), + NEONMAP1(vcaddq_rot90_v, arm_neon_vcadd_rot90, Add1ArgType), NEONMAP1(vcage_v, arm_neon_vacge, 0), NEONMAP1(vcageq_v, arm_neon_vacge, 0), NEONMAP1(vcagt_v, arm_neon_vacgt, 0), @@ -4625,10 +4629,10 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP2(vpmin_v, arm_neon_vpminu, arm_neon_vpmins, Add1ArgType | UnsignedAlts), 
NEONMAP1(vqabs_v, arm_neon_vqabs, Add1ArgType), NEONMAP1(vqabsq_v, arm_neon_vqabs, Add1ArgType), - NEONMAP2(vqadd_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts), - NEONMAP2(vqaddq_v, arm_neon_vqaddu, arm_neon_vqadds, Add1ArgType | UnsignedAlts), - NEONMAP2(vqdmlal_v, arm_neon_vqdmull, arm_neon_vqadds, 0), - NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, arm_neon_vqsubs, 0), + NEONMAP2(vqadd_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts), + NEONMAP2(vqaddq_v, uadd_sat, sadd_sat, Add1ArgType | UnsignedAlts), + NEONMAP2(vqdmlal_v, arm_neon_vqdmull, sadd_sat, 0), + NEONMAP2(vqdmlsl_v, arm_neon_vqdmull, ssub_sat, 0), NEONMAP1(vqdmulh_v, arm_neon_vqdmulh, Add1ArgType), NEONMAP1(vqdmulhq_v, arm_neon_vqdmulh, Add1ArgType), NEONMAP1(vqdmull_v, arm_neon_vqdmull, Add1ArgType), @@ -4646,8 +4650,8 @@ static const NeonIntrinsicInfo ARMSIMDIntrinsicMap [] = { NEONMAP2(vqshlq_v, arm_neon_vqshiftu, arm_neon_vqshifts, Add1ArgType | UnsignedAlts), NEONMAP1(vqshlu_n_v, arm_neon_vqshiftsu, 0), NEONMAP1(vqshluq_n_v, arm_neon_vqshiftsu, 0), - NEONMAP2(vqsub_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts), - NEONMAP2(vqsubq_v, arm_neon_vqsubu, arm_neon_vqsubs, Add1ArgType | UnsignedAlts), + NEONMAP2(vqsub_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts), + NEONMAP2(vqsubq_v, usub_sat, ssub_sat, Add1ArgType | UnsignedAlts), NEONMAP1(vraddhn_v, arm_neon_vraddhn, Add1ArgType), NEONMAP2(vrecpe_v, arm_neon_vrecpe, arm_neon_vrecpe, 0), NEONMAP2(vrecpeq_v, arm_neon_vrecpe, arm_neon_vrecpe, 0), @@ -4731,6 +4735,10 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP1(vaeseq_v, aarch64_crypto_aese, 0), NEONMAP1(vaesimcq_v, aarch64_crypto_aesimc, 0), NEONMAP1(vaesmcq_v, aarch64_crypto_aesmc, 0), + NEONMAP1(vcadd_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType), + NEONMAP1(vcadd_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType), + NEONMAP1(vcaddq_rot270_v, aarch64_neon_vcadd_rot270, Add1ArgType), + NEONMAP1(vcaddq_rot90_v, aarch64_neon_vcadd_rot90, Add1ArgType), NEONMAP1(vcage_v, aarch64_neon_facge, 0), NEONMAP1(vcageq_v, aarch64_neon_facge, 0), NEONMAP1(vcagt_v, aarch64_neon_facgt, 0), @@ -9466,14 +9474,14 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, if (!getDebugInfo()) { CGM.Error(E->getExprLoc(), "using builtin_preserve_field_info() without -g"); return IsBitField ? EmitLValue(Arg).getBitFieldPointer() - : EmitLValue(Arg).getPointer(); + : EmitLValue(Arg).getPointer(*this); } // Enable underlying preserve_*_access_index() generation. bool OldIsInPreservedAIRegion = IsInPreservedAIRegion; IsInPreservedAIRegion = true; Value *FieldAddr = IsBitField ? 
EmitLValue(Arg).getBitFieldPointer() - : EmitLValue(Arg).getPointer(); + : EmitLValue(Arg).getPointer(*this); IsInPreservedAIRegion = OldIsInPreservedAIRegion; ConstantInt *C = cast(EmitScalarExpr(E->getArg(1))); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 3d292f84c79b4..fe778e3714347 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1020,13 +1020,13 @@ void CodeGenFunction::ExpandTypeFromArgs( auto Exp = getTypeExpansion(Ty, getContext()); if (auto CAExp = dyn_cast(Exp.get())) { - forConstantArrayExpansion(*this, CAExp, LV.getAddress(), - [&](Address EltAddr) { - LValue LV = MakeAddrLValue(EltAddr, CAExp->EltTy); - ExpandTypeFromArgs(CAExp->EltTy, LV, AI); - }); + forConstantArrayExpansion( + *this, CAExp, LV.getAddress(*this), [&](Address EltAddr) { + LValue LV = MakeAddrLValue(EltAddr, CAExp->EltTy); + ExpandTypeFromArgs(CAExp->EltTy, LV, AI); + }); } else if (auto RExp = dyn_cast(Exp.get())) { - Address This = LV.getAddress(); + Address This = LV.getAddress(*this); for (const CXXBaseSpecifier *BS : RExp->Bases) { // Perform a single step derived-to-base conversion. Address Base = @@ -1047,8 +1047,13 @@ void CodeGenFunction::ExpandTypeFromArgs( auto imagValue = *AI++; EmitStoreOfComplex(ComplexPairTy(realValue, imagValue), LV, /*init*/ true); } else { + // Call EmitStoreOfScalar except when the lvalue is a bitfield to emit a + // primitive store. assert(isa(Exp.get())); - EmitStoreThroughLValue(RValue::get(*AI++), LV); + if (LV.isBitField()) + EmitStoreThroughLValue(RValue::get(*AI++), LV); + else + EmitStoreOfScalar(*AI++, LV); } } @@ -1057,7 +1062,7 @@ void CodeGenFunction::ExpandTypeToArgs( SmallVectorImpl &IRCallArgs, unsigned &IRCallArgPos) { auto Exp = getTypeExpansion(Ty, getContext()); if (auto CAExp = dyn_cast(Exp.get())) { - Address Addr = Arg.hasLValue() ? Arg.getKnownLValue().getAddress() + Address Addr = Arg.hasLValue() ? Arg.getKnownLValue().getAddress(*this) : Arg.getKnownRValue().getAggregateAddress(); forConstantArrayExpansion( *this, CAExp, Addr, [&](Address EltAddr) { @@ -1068,7 +1073,7 @@ void CodeGenFunction::ExpandTypeToArgs( IRCallArgPos); }); } else if (auto RExp = dyn_cast(Exp.get())) { - Address This = Arg.hasLValue() ? Arg.getKnownLValue().getAddress() + Address This = Arg.hasLValue() ? Arg.getKnownLValue().getAddress(*this) : Arg.getKnownRValue().getAggregateAddress(); for (const CXXBaseSpecifier *BS : RExp->Bases) { // Perform a single step derived-to-base conversion. @@ -3141,7 +3146,7 @@ static bool isProvablyNull(llvm::Value *addr) { static void emitWriteback(CodeGenFunction &CGF, const CallArgList::Writeback &writeback) { const LValue &srcLV = writeback.Source; - Address srcAddr = srcLV.getAddress(); + Address srcAddr = srcLV.getAddress(CGF); assert(!isProvablyNull(srcAddr.getPointer()) && "shouldn't have writeback for provably null argument"); @@ -3249,7 +3254,7 @@ static void emitWritebackArg(CodeGenFunction &CGF, CallArgList &args, CRE->getSubExpr()->getType()->castAs()->getPointeeType(); srcLV = CGF.MakeAddrLValue(srcAddr, srcAddrType); } - Address srcAddr = srcLV.getAddress(); + Address srcAddr = srcLV.getAddress(CGF); // The dest and src types don't necessarily match in LLVM terms // because of the crazy ObjC compatibility rules. 
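A related simplification sits in the CGBuiltin.cpp tables above: vqadd/vqsub (and the saturating halves of vqdmlal/vqdmlsl) no longer lower to ARM-specific intrinsics but to the target-independent llvm.uadd.sat/llvm.sadd.sat/llvm.usub.sat/llvm.ssub.sat family. For reference, the per-lane semantics of a signed saturating add, written out as a scalar sketch:

    #include <cstdint>
    #include <limits>

    // Reference semantics of llvm.sadd.sat.i32: the sum clamps at the
    // type's extremes instead of wrapping on overflow.
    int32_t sadd_sat(int32_t a, int32_t b) {
      int64_t wide = static_cast<int64_t>(a) + b; // cannot overflow in 64 bits
      if (wide > std::numeric_limits<int32_t>::max())
        return std::numeric_limits<int32_t>::max();
      if (wide < std::numeric_limits<int32_t>::min())
        return std::numeric_limits<int32_t>::min();
      return static_cast<int32_t>(wide);
    }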
@@ -3563,7 +3568,7 @@ RValue CallArg::getRValue(CodeGenFunction &CGF) const { CGF.EmitAggregateCopy(Copy, LV, Ty, AggValueSlot::DoesNotOverlap, LV.isVolatile()); IsUsed = true; - return RValue::getAggregate(Copy.getAddress()); + return RValue::getAggregate(Copy.getAddress(CGF)); } void CallArg::copyInto(CodeGenFunction &CGF, Address Addr) const { @@ -3573,7 +3578,7 @@ void CallArg::copyInto(CodeGenFunction &CGF, Address Addr) const { else if (!HasLV && RV.isComplex()) CGF.EmitStoreOfComplex(RV.getComplexVal(), Dst, /*init=*/true); else { - auto Addr = HasLV ? LV.getAddress() : RV.getAggregateAddress(); + auto Addr = HasLV ? LV.getAddress(CGF) : RV.getAggregateAddress(); LValue SrcLV = CGF.MakeAddrLValue(Addr, Ty); // We assume that call args are never copied into subobjects. CGF.EmitAggregateCopy(Dst, SrcLV, Ty, AggValueSlot::DoesNotOverlap, @@ -3936,7 +3941,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (I->isAggregate()) { // Replace the placeholder with the appropriate argument slot GEP. Address Addr = I->hasLValue() - ? I->getKnownLValue().getAddress() + ? I->getKnownLValue().getAddress(*this) : I->getKnownRValue().getAggregateAddress(); llvm::Instruction *Placeholder = cast(Addr.getPointer()); @@ -3981,7 +3986,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, // 3. If the argument is byval, but RV is not located in default // or alloca address space. Address Addr = I->hasLValue() - ? I->getKnownLValue().getAddress() + ? I->getKnownLValue().getAddress(*this) : I->getKnownRValue().getAggregateAddress(); llvm::Value *V = Addr.getPointer(); CharUnits Align = ArgInfo.getIndirectAlign(); @@ -4068,7 +4073,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, V = I->getKnownRValue().getScalarVal(); else V = Builder.CreateLoad( - I->hasLValue() ? I->getKnownLValue().getAddress() + I->hasLValue() ? I->getKnownLValue().getAddress(*this) : I->getKnownRValue().getAggregateAddress()); // Implement swifterror by copying into a new swifterror argument. @@ -4122,7 +4127,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Src = CreateMemTemp(I->Ty, "coerce"); I->copyInto(*this, Src); } else { - Src = I->hasLValue() ? I->getKnownLValue().getAddress() + Src = I->hasLValue() ? I->getKnownLValue().getAddress(*this) : I->getKnownRValue().getAggregateAddress(); } @@ -4177,7 +4182,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Address addr = Address::invalid(); Address AllocaAddr = Address::invalid(); if (I->isAggregate()) { - addr = I->hasLValue() ? I->getKnownLValue().getAddress() + addr = I->hasLValue() ? I->getKnownLValue().getAddress(*this) : I->getKnownRValue().getAggregateAddress(); } else { @@ -4359,6 +4364,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Callee.getAbstractInfo(), Attrs, CallingConv, /*AttrOnCallSite=*/true); + if (const FunctionDecl *FD = dyn_cast_or_null(CurFuncDecl)) + if (FD->usesFPIntrin()) + // All calls within a strictfp function are marked strictfp + Attrs = + Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, + llvm::Attribute::StrictFP); + // Apply some call-site-specific attributes. // TODO: work this into building the attribute set. 
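The block just added (and repeated for the second emission path in the next hunk) makes strictfp transitive: every call emitted inside a function that uses floating-point intrinsics carries the StrictFP attribute, so LLVM cannot move or fold it across changes to the FP environment. At the source level this corresponds roughly to the following; clang's FENV_ACCESS support was still landing at the time, so treat the pragma as illustrative:

    #include <cmath>

    // With the FP environment declared live, the sqrt call site below is
    // emitted with the strictfp attribute in the resulting IR.
    #pragma STDC FENV_ACCESS ON

    double scaled_root(double x) {
      return std::sqrt(x) * 2.0;
    }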
@@ -4408,6 +4420,13 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, SmallVector BundleList = getBundlesForFunclet(CalleePtr); + if (const FunctionDecl *FD = dyn_cast_or_null(CurFuncDecl)) + if (FD->usesFPIntrin()) + // All calls within a strictfp function are marked strictfp + Attrs = + Attrs.addAttribute(getLLVMContext(), llvm::AttributeList::FunctionIndex, + llvm::Attribute::StrictFP); + // Emit the actual call/invoke instruction. llvm::CallBase *CI; if (!InvokeDest) { diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index bcc58dcbc2c6c..ebd70302ba4cc 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -657,7 +657,7 @@ static void EmitMemberInitializer(CodeGenFunction &CGF, // the constructor. QualType::DestructionKind dtorKind = FieldType.isDestructedType(); if (CGF.needsEHCleanup(dtorKind)) - CGF.pushEHDestroy(dtorKind, LHS.getAddress(), FieldType); + CGF.pushEHDestroy(dtorKind, LHS.getAddress(CGF), FieldType); return; } } @@ -681,16 +681,12 @@ void CodeGenFunction::EmitInitializerForField(FieldDecl *Field, LValue LHS, EmitComplexExprIntoLValue(Init, LHS, /*isInit*/ true); break; case TEK_Aggregate: { - AggValueSlot Slot = - AggValueSlot::forLValue( - LHS, - AggValueSlot::IsDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - getOverlapForFieldInit(Field), - AggValueSlot::IsNotZeroed, - // Checks are made by the code that calls constructor. - AggValueSlot::IsSanitizerChecked); + AggValueSlot Slot = AggValueSlot::forLValue( + LHS, *this, AggValueSlot::IsDestructed, + AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased, + getOverlapForFieldInit(Field), AggValueSlot::IsNotZeroed, + // Checks are made by the code that calls constructor. + AggValueSlot::IsSanitizerChecked); EmitAggExpr(Init, Slot); break; } @@ -700,7 +696,7 @@ void CodeGenFunction::EmitInitializerForField(FieldDecl *Field, LValue LHS, // later in the constructor. QualType::DestructionKind dtorKind = FieldType.isDestructedType(); if (needsEHCleanup(dtorKind)) - pushEHDestroy(dtorKind, LHS.getAddress(), FieldType); + pushEHDestroy(dtorKind, LHS.getAddress(*this), FieldType); } /// Checks whether the given constructor is a valid subject for the @@ -914,6 +910,8 @@ namespace { } void addMemcpyableField(FieldDecl *F) { + if (F->isZeroSize(CGF.getContext())) + return; if (!FirstField) addInitialField(F); else @@ -961,9 +959,10 @@ namespace { LValue SrcLV = CGF.MakeNaturalAlignAddrLValue(SrcPtr, RecordTy); LValue Src = CGF.EmitLValueForFieldInitialization(SrcLV, FirstField); - emitMemcpyIR(Dest.isBitField() ? Dest.getBitFieldAddress() : Dest.getAddress(), - Src.isBitField() ? Src.getBitFieldAddress() : Src.getAddress(), - MemcpySize); + emitMemcpyIR( + Dest.isBitField() ? Dest.getBitFieldAddress() : Dest.getAddress(CGF), + Src.isBitField() ? 
Src.getBitFieldAddress() : Src.getAddress(CGF), + MemcpySize); reset(); } @@ -1117,7 +1116,7 @@ namespace { continue; LValue FieldLHS = LHS; EmitLValueForAnyFieldInitialization(CGF, MemberInit, FieldLHS); - CGF.pushEHDestroy(dtorKind, FieldLHS.getAddress(), FieldType); + CGF.pushEHDestroy(dtorKind, FieldLHS.getAddress(CGF), FieldType); } } @@ -1627,7 +1626,7 @@ namespace { LValue LV = CGF.EmitLValueForField(ThisLV, field); assert(LV.isSimple()); - CGF.emitDestroy(LV.getAddress(), field->getType(), destroyer, + CGF.emitDestroy(LV.getAddress(CGF), field->getType(), destroyer, flags.isForNormalCleanup() && useEHCleanupForArray); } }; diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index db5893a7b51f2..8d6406c027738 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -1141,10 +1141,11 @@ llvm::DIType *CGDebugInfo::CreateType(const TypedefType *Ty, // declared. SourceLocation Loc = Ty->getDecl()->getLocation(); + uint32_t Align = getDeclAlignIfRequired(Ty->getDecl(), CGM.getContext()); // Typedefs are derived from some other type. return DBuilder.createTypedef(Underlying, Ty->getDecl()->getName(), getOrCreateFile(Loc), getLineNumber(Loc), - getDeclContextDescriptor(Ty->getDecl())); + getDeclContextDescriptor(Ty->getDecl()), Align); } static unsigned getDwarfCC(CallingConv CC) { diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h index 13e9c7a38fccd..8e74f7e019655 100644 --- a/clang/lib/CodeGen/CGDebugInfo.h +++ b/clang/lib/CodeGen/CGDebugInfo.h @@ -83,7 +83,8 @@ class CGDebugInfo { /// Cache of previously constructed Types. llvm::DenseMap TypeCache; - llvm::SmallDenseMap DebugPrefixMap; + std::map> + DebugPrefixMap; /// Cache that maps VLA types to size expressions for that type, /// represented by instantiated Metadata nodes. diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index a984f67f61768..56ddc983dafcd 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -597,7 +597,7 @@ namespace { Var.getType(), VK_LValue, SourceLocation()); // Compute the address of the local variable, in case it's a byref // or something. - llvm::Value *Addr = CGF.EmitDeclRefLValue(&DRE).getPointer(); + llvm::Value *Addr = CGF.EmitDeclRefLValue(&DRE).getPointer(CGF); // In some cases, the type of the function argument will be different from // the type of the pointer. An example of this is @@ -712,18 +712,18 @@ static bool tryEmitARCCopyWeakInit(CodeGenFunction &CGF, LValue srcLV = CGF.EmitLValue(srcExpr); // Handle a formal type change to avoid asserting. - auto srcAddr = srcLV.getAddress(); + auto srcAddr = srcLV.getAddress(CGF); if (needsCast) { - srcAddr = CGF.Builder.CreateElementBitCast(srcAddr, - destLV.getAddress().getElementType()); + srcAddr = CGF.Builder.CreateElementBitCast( + srcAddr, destLV.getAddress(CGF).getElementType()); } // If it was an l-value, use objc_copyWeak. 
if (srcExpr->getValueKind() == VK_LValue) { - CGF.EmitARCCopyWeak(destLV.getAddress(), srcAddr); + CGF.EmitARCCopyWeak(destLV.getAddress(CGF), srcAddr); } else { assert(srcExpr->getValueKind() == VK_XValue); - CGF.EmitARCMoveWeak(destLV.getAddress(), srcAddr); + CGF.EmitARCMoveWeak(destLV.getAddress(CGF), srcAddr); } return true; } @@ -741,7 +741,7 @@ static bool tryEmitARCCopyWeakInit(CodeGenFunction &CGF, static void drillIntoBlockVariable(CodeGenFunction &CGF, LValue &lvalue, const VarDecl *var) { - lvalue.setAddress(CGF.emitBlockByrefAddress(lvalue.getAddress(), var)); + lvalue.setAddress(CGF.emitBlockByrefAddress(lvalue.getAddress(CGF), var)); } void CodeGenFunction::EmitNullabilityCheck(LValue LHS, llvm::Value *RHS, @@ -801,17 +801,18 @@ void CodeGenFunction::EmitScalarInit(const Expr *init, const ValueDecl *D, if (capturedByInit) { // We can use a simple GEP for this because it can't have been // moved yet. - tempLV.setAddress(emitBlockByrefAddress(tempLV.getAddress(), + tempLV.setAddress(emitBlockByrefAddress(tempLV.getAddress(*this), cast(D), /*follow*/ false)); } - auto ty = cast(tempLV.getAddress().getElementType()); + auto ty = + cast(tempLV.getAddress(*this).getElementType()); llvm::Value *zero = CGM.getNullPointer(ty, tempLV.getType()); // If __weak, we want to use a barrier under certain conditions. if (lifetime == Qualifiers::OCL_Weak) - EmitARCInitWeak(tempLV.getAddress(), zero); + EmitARCInitWeak(tempLV.getAddress(*this), zero); // Otherwise just do a simple store. else @@ -854,9 +855,9 @@ void CodeGenFunction::EmitScalarInit(const Expr *init, const ValueDecl *D, if (capturedByInit) drillIntoBlockVariable(*this, lvalue, cast(D)); if (accessedByInit) - EmitARCStoreWeak(lvalue.getAddress(), value, /*ignored*/ true); + EmitARCStoreWeak(lvalue.getAddress(*this), value, /*ignored*/ true); else - EmitARCInitWeak(lvalue.getAddress(), value); + EmitARCInitWeak(lvalue.getAddress(*this), value); return; } @@ -1940,11 +1941,10 @@ void CodeGenFunction::EmitExprAsInit(const Expr *init, const ValueDecl *D, else if (auto *FD = dyn_cast(D)) Overlap = getOverlapForFieldInit(FD); // TODO: how can we delay here if D is captured by its initializer? - EmitAggExpr(init, AggValueSlot::forLValue(lvalue, - AggValueSlot::IsDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - Overlap)); + EmitAggExpr(init, AggValueSlot::forLValue( + lvalue, *this, AggValueSlot::IsDestructed, + AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased, Overlap)); } return; } @@ -2500,7 +2500,7 @@ void CodeGenFunction::EmitParmDecl(const VarDecl &D, ParamValue Arg, // objc_storeStrong attempts to release its old value. 
llvm::Value *Null = CGM.EmitNullConstant(D.getType()); EmitStoreOfScalar(Null, lv, /* isInitialization */ true); - EmitARCStoreStrongCall(lv.getAddress(), ArgVal, true); + EmitARCStoreStrongCall(lv.getAddress(*this), ArgVal, true); DoStore = false; } else diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp index 03351dbe0672e..d54dd87e3c00c 100644 --- a/clang/lib/CodeGen/CGDeclCXX.cpp +++ b/clang/lib/CodeGen/CGDeclCXX.cpp @@ -56,10 +56,11 @@ static void EmitDeclInit(CodeGenFunction &CGF, const VarDecl &D, CGF.EmitComplexExprIntoLValue(Init, lv, /*isInit*/ true); return; case TEK_Aggregate: - CGF.EmitAggExpr(Init, AggValueSlot::forLValue(lv,AggValueSlot::IsDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - AggValueSlot::DoesNotOverlap)); + CGF.EmitAggExpr(Init, + AggValueSlot::forLValue(lv, CGF, AggValueSlot::IsDestructed, + AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased, + AggValueSlot::DoesNotOverlap)); return; } llvm_unreachable("bad evaluation kind"); diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 6becd35976d85..35009a1c285c8 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -573,7 +573,7 @@ EmitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *M) { LV = EmitLValueForField(LV, Adjustment.Field); assert(LV.isSimple() && "materialized temporary field is not a simple lvalue"); - Object = LV.getAddress(); + Object = LV.getAddress(*this); break; } @@ -594,7 +594,7 @@ CodeGenFunction::EmitReferenceBindingToExpr(const Expr *E) { // Emit the expression as an lvalue. LValue LV = EmitLValue(E); assert(LV.isSimple()); - llvm::Value *Value = LV.getPointer(); + llvm::Value *Value = LV.getPointer(*this); if (sanitizePerformTypeCheck() && !E->getType()->isFunctionType()) { // C++11 [dcl.ref]p5 (as amended by core issue 453): @@ -1127,7 +1127,7 @@ Address CodeGenFunction::EmitPointerWithAlignment(const Expr *E, LValue LV = EmitLValue(UO->getSubExpr()); if (BaseInfo) *BaseInfo = LV.getBaseInfo(); if (TBAAInfo) *TBAAInfo = LV.getTBAAInfo(); - return LV.getAddress(); + return LV.getAddress(*this); } } @@ -1217,8 +1217,8 @@ LValue CodeGenFunction::EmitCheckedLValue(const Expr *E, TypeCheckKind TCK) { if (IsBaseCXXThis || isa(ME->getBase())) SkippedChecks.set(SanitizerKind::Null, true); } - EmitTypeCheck(TCK, E->getExprLoc(), LV.getPointer(), - E->getType(), LV.getAlignment(), SkippedChecks); + EmitTypeCheck(TCK, E->getExprLoc(), LV.getPointer(*this), E->getType(), + LV.getAlignment(), SkippedChecks); } return LV; } @@ -1305,7 +1305,7 @@ LValue CodeGenFunction::EmitLValue(const Expr *E) { if (LV.isSimple()) { // Defend against branches out of gnu statement expressions surrounded by // cleanups. 
- llvm::Value *V = LV.getPointer(); + llvm::Value *V = LV.getPointer(*this); Scope.ForceCleanup({&V}); return LValue::MakeAddr(Address(V, LV.getAlignment()), LV.getType(), getContext(), LV.getBaseInfo(), LV.getTBAAInfo()); @@ -1521,7 +1521,7 @@ llvm::Value *CodeGenFunction::emitScalarConstant( llvm::Value *CodeGenFunction::EmitLoadOfScalar(LValue lvalue, SourceLocation Loc) { - return EmitLoadOfScalar(lvalue.getAddress(), lvalue.isVolatile(), + return EmitLoadOfScalar(lvalue.getAddress(*this), lvalue.isVolatile(), lvalue.getType(), Loc, lvalue.getBaseInfo(), lvalue.getTBAAInfo(), lvalue.isNontemporal()); } @@ -1771,7 +1771,7 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *Value, Address Addr, void CodeGenFunction::EmitStoreOfScalar(llvm::Value *value, LValue lvalue, bool isInit) { - EmitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), + EmitStoreOfScalar(value, lvalue.getAddress(*this), lvalue.isVolatile(), lvalue.getType(), lvalue.getBaseInfo(), lvalue.getTBAAInfo(), isInit, lvalue.isNontemporal()); } @@ -1782,18 +1782,18 @@ void CodeGenFunction::EmitStoreOfScalar(llvm::Value *value, LValue lvalue, RValue CodeGenFunction::EmitLoadOfLValue(LValue LV, SourceLocation Loc) { if (LV.isObjCWeak()) { // load of a __weak object. - Address AddrWeakObj = LV.getAddress(); + Address AddrWeakObj = LV.getAddress(*this); return RValue::get(CGM.getObjCRuntime().EmitObjCWeakRead(*this, AddrWeakObj)); } if (LV.getQuals().getObjCLifetime() == Qualifiers::OCL_Weak) { // In MRC mode, we do a load+autorelease. if (!getLangOpts().ObjCAutoRefCount) { - return RValue::get(EmitARCLoadWeak(LV.getAddress())); + return RValue::get(EmitARCLoadWeak(LV.getAddress(*this))); } // In ARC mode, we load retained and then consume the value. - llvm::Value *Object = EmitARCLoadWeakRetained(LV.getAddress()); + llvm::Value *Object = EmitARCLoadWeakRetained(LV.getAddress(*this)); Object = EmitObjCConsumeObject(LV.getType(), Object); return RValue::get(Object); } @@ -1979,9 +1979,10 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst, case Qualifiers::OCL_Weak: if (isInit) // Initialize and then skip the primitive store. - EmitARCInitWeak(Dst.getAddress(), Src.getScalarVal()); + EmitARCInitWeak(Dst.getAddress(*this), Src.getScalarVal()); else - EmitARCStoreWeak(Dst.getAddress(), Src.getScalarVal(), /*ignore*/ true); + EmitARCStoreWeak(Dst.getAddress(*this), Src.getScalarVal(), + /*ignore*/ true); return; case Qualifiers::OCL_Autoreleasing: @@ -1994,7 +1995,7 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst, if (Dst.isObjCWeak() && !Dst.isNonGC()) { // load of a __weak object. - Address LvalueDst = Dst.getAddress(); + Address LvalueDst = Dst.getAddress(*this); llvm::Value *src = Src.getScalarVal(); CGM.getObjCRuntime().EmitObjCWeakAssign(*this, src, LvalueDst); return; @@ -2002,7 +2003,7 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst, if (Dst.isObjCStrong() && !Dst.isNonGC()) { // load of a __strong object. 
- Address LvalueDst = Dst.getAddress(); + Address LvalueDst = Dst.getAddress(*this); llvm::Value *src = Src.getScalarVal(); if (Dst.isObjCIvar()) { assert(Dst.getBaseIvarExp() && "BaseIvarExp is NULL"); @@ -2328,8 +2329,8 @@ Address CodeGenFunction::EmitLoadOfReference(LValue RefLVal, LValueBaseInfo *PointeeBaseInfo, TBAAAccessInfo *PointeeTBAAInfo) { - llvm::LoadInst *Load = Builder.CreateLoad(RefLVal.getAddress(), - RefLVal.isVolatile()); + llvm::LoadInst *Load = + Builder.CreateLoad(RefLVal.getAddress(*this), RefLVal.isVolatile()); CGM.DecorateInstructionWithTBAA(Load, RefLVal.getTBAAInfo()); CharUnits Align = getNaturalTypeAlignment(RefLVal.getType()->getPointeeType(), @@ -2585,7 +2586,7 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) { EmitCapturedFieldLValue(*this, CapturedStmtInfo->lookup(VD), CapturedStmtInfo->getContextValue()); return MakeAddrLValue( - Address(CapLVal.getPointer(), getContext().getDeclAlign(VD)), + Address(CapLVal.getPointer(*this), getContext().getDeclAlign(VD)), CapLVal.getType(), LValueBaseInfo(AlignmentSource::Decl), CapLVal.getTBAAInfo()); } @@ -2720,7 +2721,7 @@ LValue CodeGenFunction::EmitUnaryOpLValue(const UnaryOperator *E) { // __real is valid on scalars. This is a faster way of testing that. // __imag can only produce an rvalue on scalars. if (E->getOpcode() == UO_Real && - !LV.getAddress().getElementType()->isStructTy()) { + !LV.getAddress(*this).getElementType()->isStructTy()) { assert(E->getSubExpr()->getType()->isArithmeticType()); return LV; } @@ -2728,9 +2729,9 @@ LValue CodeGenFunction::EmitUnaryOpLValue(const UnaryOperator *E) { QualType T = ExprTy->castAs()->getElementType(); Address Component = - (E->getOpcode() == UO_Real - ? emitAddrOfRealComponent(LV.getAddress(), LV.getType()) - : emitAddrOfImagComponent(LV.getAddress(), LV.getType())); + (E->getOpcode() == UO_Real + ? emitAddrOfRealComponent(LV.getAddress(*this), LV.getType()) + : emitAddrOfImagComponent(LV.getAddress(*this), LV.getType())); LValue ElemLV = MakeAddrLValue(Component, T, LV.getBaseInfo(), CGM.getTBAAInfoForSubobject(LV, T)); ElemLV.getQuals().addQualifiers(LV.getQuals()); @@ -3330,7 +3331,7 @@ Address CodeGenFunction::EmitArrayToPointerDecay(const Expr *E, // Expressions of array type can't be bitfields or vector elements. LValue LV = EmitLValue(E); - Address Addr = LV.getAddress(); + Address Addr = LV.getAddress(*this); // If the array type was an incomplete type, we need to make sure // the decay ends up being the right type. @@ -3545,8 +3546,9 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, LValue LHS = EmitLValue(E->getBase()); auto *Idx = EmitIdxAfterBase(/*Promote*/false); assert(LHS.isSimple() && "Can only subscript lvalue vectors here!"); - return LValue::MakeVectorElt(LHS.getAddress(), Idx, E->getBase()->getType(), - LHS.getBaseInfo(), TBAAAccessInfo()); + return LValue::MakeVectorElt(LHS.getAddress(*this), Idx, + E->getBase()->getType(), LHS.getBaseInfo(), + TBAAAccessInfo()); } // All the other cases basically behave like simple offsetting. @@ -3647,7 +3649,7 @@ LValue CodeGenFunction::EmitArraySubscriptExpr(const ArraySubscriptExpr *E, // Propagate the alignment from the array itself to the result. 
QualType arrayType = Array->getType(); Addr = emitArraySubscriptGEP( - *this, ArrayLV.getAddress(), {CGM.getSize(CharUnits::Zero()), Idx}, + *this, ArrayLV.getAddress(*this), {CGM.getSize(CharUnits::Zero()), Idx}, E->getType(), !getLangOpts().isSignedOverflowDefined(), SignedIndices, E->getExprLoc(), &arrayType, E->getBase(), "arrayidx", ArrayDecl); EltBaseInfo = ArrayLV.getBaseInfo(); @@ -3682,7 +3684,7 @@ static Address emitOMPArraySectionBase(CodeGenFunction &CGF, const Expr *Base, if (auto *ASE = dyn_cast(Base->IgnoreParenImpCasts())) { BaseLVal = CGF.EmitOMPArraySectionExpr(ASE, IsLowerBound); if (BaseTy->isArrayType()) { - Address Addr = BaseLVal.getAddress(); + Address Addr = BaseLVal.getAddress(CGF); BaseInfo = BaseLVal.getBaseInfo(); // If the array type was an incomplete type, we need to make sure @@ -3707,7 +3709,7 @@ static Address emitOMPArraySectionBase(CodeGenFunction &CGF, const Expr *Base, &TypeTBAAInfo); BaseInfo.mergeForCast(TypeBaseInfo); TBAAInfo = CGF.CGM.mergeTBAAInfoForCast(TBAAInfo, TypeTBAAInfo); - return Address(CGF.Builder.CreateLoad(BaseLVal.getAddress()), Align); + return Address(CGF.Builder.CreateLoad(BaseLVal.getAddress(CGF)), Align); } return CGF.EmitPointerWithAlignment(Base, &BaseInfo, &TBAAInfo); } @@ -3848,7 +3850,7 @@ LValue CodeGenFunction::EmitOMPArraySectionExpr(const OMPArraySectionExpr *E, // Propagate the alignment from the array itself to the result. EltPtr = emitArraySubscriptGEP( - *this, ArrayLV.getAddress(), {CGM.getSize(CharUnits::Zero()), Idx}, + *this, ArrayLV.getAddress(*this), {CGM.getSize(CharUnits::Zero()), Idx}, ResultExprTy, !getLangOpts().isSignedOverflowDefined(), /*signedIndices=*/false, E->getExprLoc()); BaseInfo = ArrayLV.getBaseInfo(); @@ -3908,7 +3910,7 @@ EmitExtVectorElementExpr(const ExtVectorElementExpr *E) { if (Base.isSimple()) { llvm::Constant *CV = llvm::ConstantDataVector::get(getLLVMContext(), Indices); - return LValue::MakeExtVectorElt(Base.getAddress(), CV, type, + return LValue::MakeExtVectorElt(Base.getAddress(*this), CV, type, Base.getBaseInfo(), TBAAAccessInfo()); } assert(Base.isExtVectorElt() && "Can only subscript lvalue vec elts here!"); @@ -4059,7 +4061,7 @@ LValue CodeGenFunction::EmitLValueForField(LValue base, const CGRecordLayout &RL = CGM.getTypes().getCGRecordLayout(field->getParent()); const CGBitFieldInfo &Info = RL.getBitFieldInfo(field); - Address Addr = base.getAddress(); + Address Addr = base.getAddress(*this); unsigned Idx = RL.getLLVMFieldNo(field); const RecordDecl *rec = field->getParent(); if (!IsInPreservedAIRegion && @@ -4127,7 +4129,7 @@ LValue CodeGenFunction::EmitLValueForField(LValue base, getContext().getTypeSizeInChars(FieldType).getQuantity(); } - Address addr = base.getAddress(); + Address addr = base.getAddress(*this); if (auto *ClassDef = dyn_cast(rec)) { if (CGM.getCodeGenOpts().StrictVTablePointers && ClassDef->isDynamicClass()) { @@ -4223,7 +4225,7 @@ CodeGenFunction::EmitLValueForFieldInitialization(LValue Base, if (!FieldType->isReferenceType()) return EmitLValueForField(Base, Field); - Address V = emitAddrOfFieldStorage(*this, Base.getAddress(), Field); + Address V = emitAddrOfFieldStorage(*this, Base.getAddress(*this), Field); // Make sure that the address is pointing to the right type. 
llvm::Type *llvmType = ConvertTypeForMem(FieldType); @@ -4341,8 +4343,8 @@ EmitConditionalOperatorLValue(const AbstractConditionalOperator *expr) { EmitBlock(contBlock); if (lhs && rhs) { - llvm::Value *lhsPtr = lhs->getPointer(); - llvm::Value *rhsPtr = rhs->getPointer(); + llvm::Value *lhsPtr = lhs->getPointer(*this); + llvm::Value *rhsPtr = rhs->getPointer(*this); if (rhsPtr->getType() != lhsPtr->getType()) { if (!getLangOpts().SYCLIsDevice) llvm_unreachable( @@ -4372,8 +4374,8 @@ EmitConditionalOperatorLValue(const AbstractConditionalOperator *expr) { phi->addIncoming(rhsPtr, rhsBlock); Address result(phi, std::min(lhs->getAlignment(), rhs->getAlignment())); AlignmentSource alignSource = - std::max(lhs->getBaseInfo().getAlignmentSource(), - rhs->getBaseInfo().getAlignmentSource()); + std::max(lhs->getBaseInfo().getAlignmentSource(), + rhs->getBaseInfo().getAlignmentSource()); TBAAAccessInfo TBAAInfo = CGM.mergeTBAAInfoForConditionalOperator( lhs->getTBAAInfo(), rhs->getTBAAInfo()); return MakeAddrLValue(result, expr->getType(), LValueBaseInfo(alignSource), @@ -4452,7 +4454,7 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) { case CK_Dynamic: { LValue LV = EmitLValue(E->getSubExpr()); - Address V = LV.getAddress(); + Address V = LV.getAddress(*this); const auto *DCE = cast(E); return MakeNaturalAlignAddrLValue(EmitDynamicCast(V, DCE), E->getType()); } @@ -4472,7 +4474,7 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) { auto *DerivedClassDecl = cast(DerivedClassTy->getDecl()); LValue LV = EmitLValue(E->getSubExpr()); - Address This = LV.getAddress(); + Address This = LV.getAddress(*this); // Perform the derived-to-base conversion Address Base = GetAddressOfBaseClass( @@ -4494,10 +4496,9 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) { LValue LV = EmitLValue(E->getSubExpr()); // Perform the base-to-derived conversion - Address Derived = - GetAddressOfDerivedClass(LV.getAddress(), DerivedClassDecl, - E->path_begin(), E->path_end(), - /*NullCheckValue=*/false); + Address Derived = GetAddressOfDerivedClass( + LV.getAddress(*this), DerivedClassDecl, E->path_begin(), E->path_end(), + /*NullCheckValue=*/false); // C++11 [expr.static.cast]p2: Behavior is undefined if a downcast is // performed and the object is not of the derived type. 
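The [expr.static.cast]p2 comment above deserves a concrete illustration. The downcast below is well-formed, yet undefined at run time because the object is not actually of the derived type; this is the case the emitted type check (under -fsanitize=vptr) is meant to catch. Illustrative types only:

    struct Base { virtual ~Base() = default; };
    struct Derived : Base { int Extra = 0; };

    void demo() {
      Base B;
      // Compiles fine, but undefined behavior: B is not a Derived.
      Derived &D = static_cast<Derived &>(B);
      (void)D;
    }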
@@ -4519,7 +4520,7 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) { CGM.EmitExplicitCastExprType(CE, this); LValue LV = EmitLValue(E->getSubExpr()); - Address V = Builder.CreateBitCast(LV.getAddress(), + Address V = Builder.CreateBitCast(LV.getAddress(*this), ConvertType(CE->getTypeAsWritten())); if (SanOpts.has(SanitizerKind::CFIUnrelatedCast)) @@ -4534,14 +4535,15 @@ LValue CodeGenFunction::EmitCastLValue(const CastExpr *E) { LValue LV = EmitLValue(E->getSubExpr()); QualType DestTy = getContext().getPointerType(E->getType()); llvm::Value *V = getTargetHooks().performAddrSpaceCast( - *this, LV.getPointer(), E->getSubExpr()->getType().getAddressSpace(), + *this, LV.getPointer(*this), + E->getSubExpr()->getType().getAddressSpace(), E->getType().getAddressSpace(), ConvertType(DestTy)); - return MakeAddrLValue(Address(V, LV.getAddress().getAlignment()), + return MakeAddrLValue(Address(V, LV.getAddress(*this).getAlignment()), E->getType(), LV.getBaseInfo(), LV.getTBAAInfo()); } case CK_ObjCObjectLValueCast: { LValue LV = EmitLValue(E->getSubExpr()); - Address V = Builder.CreateElementBitCast(LV.getAddress(), + Address V = Builder.CreateElementBitCast(LV.getAddress(*this), ConvertType(E->getType())); return MakeAddrLValue(V, E->getType(), LV.getBaseInfo(), CGM.getTBAAInfoForSubobject(LV, E->getType())); @@ -4595,13 +4597,17 @@ RValue CodeGenFunction::EmitRValueForField(LValue LV, case TEK_Complex: return RValue::getComplex(EmitLoadOfComplex(FieldLV, Loc)); case TEK_Aggregate: - return FieldLV.asAggregateRValue(); + return FieldLV.asAggregateRValue(*this); case TEK_Scalar: // This routine is used to load fields one-by-one to perform a copy, so // don't load reference fields. if (FD->getType()->isReferenceType()) - return RValue::get(FieldLV.getPointer()); - return EmitLoadOfLValue(FieldLV, Loc); + return RValue::get(FieldLV.getPointer(*this)); + // Call EmitLoadOfScalar except when the lvalue is a bitfield to emit a + // primitive load. 
+ if (FieldLV.isBitField()) + return EmitLoadOfLValue(FieldLV, Loc); + return RValue::get(EmitLoadOfScalar(FieldLV, Loc)); } llvm_unreachable("bad evaluation kind"); } @@ -4695,7 +4701,7 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) { functionType = ptrType->getPointeeType(); } else { functionType = E->getType(); - calleePtr = EmitLValue(E).getPointer(); + calleePtr = EmitLValue(E).getPointer(*this); } assert(functionType->isFunctionType()); @@ -4855,7 +4861,7 @@ LValue CodeGenFunction::EmitObjCIvarRefLValue(const ObjCIvarRefExpr *E) { BaseQuals = ObjectTy.getQualifiers(); } else { LValue BaseLV = EmitLValue(BaseExpr); - BaseValue = BaseLV.getPointer(); + BaseValue = BaseLV.getPointer(*this); ObjectTy = BaseExpr->getType(); BaseQuals = ObjectTy.getQualifiers(); } @@ -5065,7 +5071,7 @@ EmitPointerToDataMemberBinaryExpr(const BinaryOperator *E) { if (E->getOpcode() == BO_PtrMemI) { BaseAddr = EmitPointerWithAlignment(E->getLHS()); } else { - BaseAddr = EmitLValue(E->getLHS()).getAddress(); + BaseAddr = EmitLValue(E->getLHS()).getAddress(*this); } llvm::Value *OffsetV = EmitScalarExpr(E->getRHS()); @@ -5092,7 +5098,7 @@ RValue CodeGenFunction::convertTempToRValue(Address addr, case TEK_Complex: return RValue::getComplex(EmitLoadOfComplex(lvalue, loc)); case TEK_Aggregate: - return lvalue.asAggregateRValue(); + return lvalue.asAggregateRValue(*this); case TEK_Scalar: return RValue::get(EmitLoadOfScalar(lvalue, loc)); } diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index 7e69f63fe1354..41a9329386559 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -345,10 +345,9 @@ void AggExprEmitter::EmitFinalDestCopy(QualType type, const LValue &src, } } - AggValueSlot srcAgg = - AggValueSlot::forLValue(src, AggValueSlot::IsDestructed, - needsGC(type), AggValueSlot::IsAliased, - AggValueSlot::MayOverlap); + AggValueSlot srcAgg = AggValueSlot::forLValue( + src, CGF, AggValueSlot::IsDestructed, needsGC(type), + AggValueSlot::IsAliased, AggValueSlot::MayOverlap); EmitCopy(type, Dest, srcAgg); } @@ -386,7 +385,7 @@ AggExprEmitter::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E) { ASTContext &Ctx = CGF.getContext(); LValue Array = CGF.EmitLValue(E->getSubExpr()); assert(Array.isSimple() && "initializer_list array not a simple lvalue"); - Address ArrayPtr = Array.getAddress(); + Address ArrayPtr = Array.getAddress(CGF); const ConstantArrayType *ArrayType = Ctx.getAsConstantArrayType(E->getSubExpr()->getType()); @@ -493,7 +492,7 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType, if (NumInitElements * elementSize.getQuantity() > 16 && elementType.isTriviallyCopyableType(CGF.getContext())) { CodeGen::CodeGenModule &CGM = CGF.CGM; - ConstantEmitter Emitter(CGM); + ConstantEmitter Emitter(CGF); LangAS AS = ArrayQTy.getAddressSpace(); if (llvm::Constant *C = Emitter.tryEmitForInitializer(E, AS, ArrayQTy)) { auto GV = new llvm::GlobalVariable( @@ -688,7 +687,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) { CodeGenFunction::TCK_Load); // FIXME: Do we also need to handle property references here? 
if (LV.isSimple()) - CGF.EmitDynamicCast(LV.getAddress(), cast(E)); + CGF.EmitDynamicCast(LV.getAddress(CGF), cast(E)); else CGF.CGM.ErrorUnsupported(E, "non-simple lvalue dynamic_cast"); @@ -723,7 +722,7 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) { LValue SourceLV = CGF.EmitLValue(E->getSubExpr()); Address SourceAddress = - Builder.CreateElementBitCast(SourceLV.getAddress(), CGF.Int8Ty); + Builder.CreateElementBitCast(SourceLV.getAddress(CGF), CGF.Int8Ty); Address DestAddress = Builder.CreateElementBitCast(Dest.getAddress(), CGF.Int8Ty); llvm::Value *SizeVal = llvm::ConstantInt::get( @@ -1163,7 +1162,7 @@ void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) { } EmitCopy(E->getLHS()->getType(), - AggValueSlot::forLValue(LHS, AggValueSlot::IsDestructed, + AggValueSlot::forLValue(LHS, CGF, AggValueSlot::IsDestructed, needsGC(E->getLHS()->getType()), AggValueSlot::IsAliased, AggValueSlot::MayOverlap), @@ -1184,11 +1183,9 @@ void AggExprEmitter::VisitBinAssign(const BinaryOperator *E) { } // Codegen the RHS so that it stores directly into the LHS. - AggValueSlot LHSSlot = - AggValueSlot::forLValue(LHS, AggValueSlot::IsDestructed, - needsGC(E->getLHS()->getType()), - AggValueSlot::IsAliased, - AggValueSlot::MayOverlap); + AggValueSlot LHSSlot = AggValueSlot::forLValue( + LHS, CGF, AggValueSlot::IsDestructed, needsGC(E->getLHS()->getType()), + AggValueSlot::IsAliased, AggValueSlot::MayOverlap); // A non-volatile aggregate destination might have volatile member. if (!LHSSlot.isVolatile() && CGF.hasVolatileMember(E->getLHS()->getType())) @@ -1320,7 +1317,7 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) { llvm::Constant::getNullValue(CGF.Int8PtrTy), CharUnits::One()); // placeholder - CGF.pushDestroy(EHCleanup, LV.getAddress(), CurField->getType(), + CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), CurField->getType(), CGF.getDestroyer(DtorKind), false); Cleanups.push_back(CGF.EHStack.stable_begin()); } @@ -1408,12 +1405,11 @@ AggExprEmitter::EmitInitializationToLValue(Expr *E, LValue LV) { CGF.EmitComplexExprIntoLValue(E, LV, /*isInit*/ true); return; case TEK_Aggregate: - CGF.EmitAggExpr(E, AggValueSlot::forLValue(LV, - AggValueSlot::IsDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - AggValueSlot::MayOverlap, - Dest.isZeroed())); + CGF.EmitAggExpr( + E, AggValueSlot::forLValue(LV, CGF, AggValueSlot::IsDestructed, + AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased, + AggValueSlot::MayOverlap, Dest.isZeroed())); return; case TEK_Scalar: if (LV.isSimple()) { @@ -1449,7 +1445,7 @@ void AggExprEmitter::EmitNullInitializationToLValue(LValue lv) { // There's a potential optimization opportunity in combining // memsets; that would be easy for arrays, but relatively // difficult for structures with the current code. - CGF.EmitNullInitialization(lv.getAddress(), lv.getType()); + CGF.EmitNullInitialization(lv.getAddress(CGF), lv.getType()); } } @@ -1606,7 +1602,7 @@ void AggExprEmitter::VisitInitListExpr(InitListExpr *E) { = field->getType().isDestructedType()) { assert(LV.isSimple()); if (CGF.needsEHCleanup(dtorKind)) { - CGF.pushDestroy(EHCleanup, LV.getAddress(), field->getType(), + CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), field->getType(), CGF.getDestroyer(dtorKind), false); addCleanup(CGF.EHStack.stable_begin()); pushedCleanup = true; @@ -1617,7 +1613,7 @@ void AggExprEmitter::VisitInitListExpr(InitListExpr *E) { // else, clean it up for -O0 builds and general tidiness. 
if (!pushedCleanup && LV.isSimple()) if (llvm::GetElementPtrInst *GEP = - dyn_cast(LV.getPointer())) + dyn_cast(LV.getPointer(CGF))) if (GEP->use_empty()) GEP->eraseFromParent(); } @@ -1699,9 +1695,8 @@ void AggExprEmitter::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E, if (InnerLoop) { // If the subexpression is an ArrayInitLoopExpr, share its cleanup. auto elementSlot = AggValueSlot::forLValue( - elementLV, AggValueSlot::IsDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, + elementLV, CGF, AggValueSlot::IsDestructed, + AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap); AggExprEmitter(CGF, elementSlot, false) .VisitArrayInitLoopExpr(InnerLoop, outerBegin); @@ -1864,10 +1859,10 @@ LValue CodeGenFunction::EmitAggExprToLValue(const Expr *E) { assert(hasAggregateEvaluationKind(E->getType()) && "Invalid argument!"); Address Temp = CreateMemTemp(E->getType()); LValue LV = MakeAddrLValue(Temp, E->getType()); - EmitAggExpr(E, AggValueSlot::forLValue(LV, AggValueSlot::IsNotDestructed, - AggValueSlot::DoesNotNeedGCBarriers, - AggValueSlot::IsNotAliased, - AggValueSlot::DoesNotOverlap)); + EmitAggExpr(E, AggValueSlot::forLValue( + LV, *this, AggValueSlot::IsNotDestructed, + AggValueSlot::DoesNotNeedGCBarriers, + AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap)); return LV; } @@ -1916,8 +1911,8 @@ void CodeGenFunction::EmitAggregateCopy(LValue Dest, LValue Src, QualType Ty, bool isVolatile) { assert(!Ty->isAnyComplexType() && "Shouldn't happen for complex"); - Address DestPtr = Dest.getAddress(); - Address SrcPtr = Src.getAddress(); + Address DestPtr = Dest.getAddress(*this); + Address SrcPtr = Src.getAddress(*this); if (getLangOpts().CPlusPlus) { if (const RecordType *RT = Ty->getAs()) { diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 114d806d454bb..269b80b434032 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -133,7 +133,7 @@ RValue CodeGenFunction::EmitCXXPseudoDestructorExpr( BaseQuals = PTy->getPointeeType().getQualifiers(); } else { LValue BaseLV = EmitLValue(BaseExpr); - BaseValue = BaseLV.getAddress(); + BaseValue = BaseLV.getAddress(*this); QualType BaseTy = BaseExpr->getType(); BaseQuals = BaseTy.getQualifiers(); } @@ -271,11 +271,11 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( assert(ReturnValue.isNull() && "Constructor shouldn't have return value"); CallArgList Args; commonEmitCXXMemberOrOperatorCall( - *this, Ctor, This.getPointer(), /*ImplicitParam=*/nullptr, + *this, Ctor, This.getPointer(*this), /*ImplicitParam=*/nullptr, /*ImplicitParamTy=*/QualType(), CE, Args, nullptr); EmitCXXConstructorCall(Ctor, Ctor_Complete, /*ForVirtualBase=*/false, - /*Delegating=*/false, This.getAddress(), Args, + /*Delegating=*/false, This.getAddress(*this), Args, AggValueSlot::DoesNotOverlap, CE->getExprLoc(), /*NewPointerIsChecked=*/false); return RValue::get(nullptr); @@ -293,7 +293,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( (*(CE->arg_begin() + 1))->getType()) : EmitLValue(*CE->arg_begin()); EmitAggregateAssign(This, RHS, CE->getType()); - return RValue::get(This.getPointer()); + return RValue::get(This.getPointer(*this)); } llvm_unreachable("unknown trivial member function"); } @@ -328,7 +328,8 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( if (IsImplicitObjectCXXThis || isa(IOA)) SkippedChecks.set(SanitizerKind::Null, true); } - EmitTypeCheck(CodeGenFunction::TCK_MemberCall, 
CallLoc, This.getPointer(), + EmitTypeCheck(CodeGenFunction::TCK_MemberCall, CallLoc, + This.getPointer(*this), C.getRecordType(CalleeDecl->getParent()), /*Alignment=*/CharUnits::Zero(), SkippedChecks); @@ -345,9 +346,9 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( "Destructor shouldn't have explicit parameters"); assert(ReturnValue.isNull() && "Destructor shouldn't have return value"); if (UseVirtualCall) { - CGM.getCXXABI().EmitVirtualDestructorCall( - *this, Dtor, Dtor_Complete, This.getAddress(), - cast(CE)); + CGM.getCXXABI().EmitVirtualDestructorCall(*this, Dtor, Dtor_Complete, + This.getAddress(*this), + cast(CE)); } else { GlobalDecl GD(Dtor, Dtor_Complete); CGCallee Callee; @@ -362,7 +363,7 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( QualType ThisTy = IsArrow ? Base->getType()->getPointeeType() : Base->getType(); - EmitCXXDestructorCall(GD, Callee, This.getPointer(), ThisTy, + EmitCXXDestructorCall(GD, Callee, This.getPointer(*this), ThisTy, /*ImplicitParam=*/nullptr, /*ImplicitParamTy=*/QualType(), nullptr); } @@ -374,15 +375,14 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( CGCallee Callee; if (UseVirtualCall) { - Callee = CGCallee::forVirtual(CE, MD, This.getAddress(), Ty); + Callee = CGCallee::forVirtual(CE, MD, This.getAddress(*this), Ty); } else { if (SanOpts.has(SanitizerKind::CFINVCall) && MD->getParent()->isDynamicClass()) { llvm::Value *VTable; const CXXRecordDecl *RD; - std::tie(VTable, RD) = - CGM.getCXXABI().LoadVTablePtr(*this, This.getAddress(), - CalleeDecl->getParent()); + std::tie(VTable, RD) = CGM.getCXXABI().LoadVTablePtr( + *this, This.getAddress(*this), CalleeDecl->getParent()); EmitVTablePtrCheckForCall(RD, VTable, CFITCK_NVCall, CE->getBeginLoc()); } @@ -401,12 +401,12 @@ RValue CodeGenFunction::EmitCXXMemberOrOperatorMemberCallExpr( if (MD->isVirtual()) { Address NewThisAddr = CGM.getCXXABI().adjustThisArgumentForVirtualFunctionCall( - *this, CalleeDecl, This.getAddress(), UseVirtualCall); + *this, CalleeDecl, This.getAddress(*this), UseVirtualCall); This.setAddress(NewThisAddr); } return EmitCXXMemberOrOperatorCall( - CalleeDecl, Callee, ReturnValue, This.getPointer(), + CalleeDecl, Callee, ReturnValue, This.getPointer(*this), /*ImplicitParam=*/nullptr, QualType(), CE, RtlArgs); } @@ -428,7 +428,7 @@ CodeGenFunction::EmitCXXMemberPointerCallExpr(const CXXMemberCallExpr *E, if (BO->getOpcode() == BO_PtrMemI) This = EmitPointerWithAlignment(BaseExpr); else - This = EmitLValue(BaseExpr).getAddress(); + This = EmitLValue(BaseExpr).getAddress(*this); EmitTypeCheck(TCK_MemberCall, E->getExprLoc(), This.getPointer(), QualType(MPT->getClass(), 0)); @@ -2103,7 +2103,7 @@ static bool isGLValueFromPointerDeref(const Expr *E) { static llvm::Value *EmitTypeidFromVTable(CodeGenFunction &CGF, const Expr *E, llvm::Type *StdTypeInfoPtrTy) { // Get the vtable pointer. 
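The change repeated throughout these hunks is mechanical: LValue::getAddress()/getPointer() and AggValueSlot::forLValue now take the emitting CodeGenFunction, so CodeGenFunction members pass *this and helper emitters pass the CGF reference they hold. The patch itself does not state the motivation; the following is only a minimal standalone sketch of the calling-convention change, using stand-in types rather than clang's real ones:

    #include <cstdio>

    // Stand-ins for clang's types; only the threading pattern matters here.
    struct Address { void *Ptr = nullptr; };

    struct CodeGenFunction;

    struct LValue {
      Address Addr;
      // The accessor now requires the emitting function, so computing the
      // address can consult per-function state if it ever needs to.
      Address getAddress(CodeGenFunction &CGF) const;
    };

    struct CodeGenFunction {
      void emitSomething(const LValue &LV) {
        Address A = LV.getAddress(*this); // members pass *this ...
        std::printf("addr=%p\n", A.Ptr);
      }
    };

    Address LValue::getAddress(CodeGenFunction &) const { return Addr; }

    int main() {
      CodeGenFunction CGF;
      LValue LV;
      CGF.emitSomething(LV); // ... helpers holding a CGF pass that instead
      return 0;
    }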
- Address ThisPtr = CGF.EmitLValue(E).getAddress(); + Address ThisPtr = CGF.EmitLValue(E).getAddress(CGF); QualType SrcRecordTy = E->getType(); diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index 385f87f12a9b3..6b11969771567 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -348,7 +348,7 @@ ComplexPairTy ComplexExprEmitter::EmitLoadOfLValue(LValue lvalue, if (lvalue.getType()->isAtomicType()) return CGF.EmitAtomicLoad(lvalue, loc).getComplexVal(); - Address SrcPtr = lvalue.getAddress(); + Address SrcPtr = lvalue.getAddress(CGF); bool isVolatile = lvalue.isVolatileQualified(); llvm::Value *Real = nullptr, *Imag = nullptr; @@ -374,7 +374,7 @@ void ComplexExprEmitter::EmitStoreOfComplex(ComplexPairTy Val, LValue lvalue, (!isInit && CGF.LValueIsSuitableForInlineAtomic(lvalue))) return CGF.EmitAtomicStore(RValue::getComplex(Val), lvalue, isInit); - Address Ptr = lvalue.getAddress(); + Address Ptr = lvalue.getAddress(CGF); Address RealPtr = CGF.emitAddrOfRealComponent(Ptr, lvalue.getType()); Address ImagPtr = CGF.emitAddrOfImagComponent(Ptr, lvalue.getType()); @@ -463,14 +463,14 @@ ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op, case CK_LValueBitCast: { LValue origLV = CGF.EmitLValue(Op); - Address V = origLV.getAddress(); + Address V = origLV.getAddress(CGF); V = Builder.CreateElementBitCast(V, CGF.ConvertType(DestTy)); return EmitLoadOfLValue(CGF.MakeAddrLValue(V, DestTy), Op->getExprLoc()); } case CK_LValueToRValueBitCast: { LValue SourceLVal = CGF.EmitLValue(Op); - Address Addr = Builder.CreateElementBitCast(SourceLVal.getAddress(), + Address Addr = Builder.CreateElementBitCast(SourceLVal.getAddress(CGF), CGF.ConvertTypeForMem(DestTy)); LValue DestLV = CGF.MakeAddrLValue(Addr, DestTy); DestLV.setTBAAInfo(TBAAAccessInfo::getMayAliasInfo()); diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 69df9e4103b14..84dbb55be3e1e 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -615,7 +615,7 @@ class ScalarExprEmitter if (isa(E->getType())) // never sugared return CGF.CGM.getMemberPointerConstant(E); - return EmitLValue(E->getSubExpr()).getPointer(); + return EmitLValue(E->getSubExpr()).getPointer(CGF); } Value *VisitUnaryDeref(const UnaryOperator *E) { if (E->getType()->isVoidType()) @@ -644,8 +644,8 @@ class ScalarExprEmitter auto &Ctx = CGF.getContext(); APValue Evaluated = SLE->EvaluateInContext(Ctx, CGF.CurSourceLocExprScope.getDefaultExpr()); - return ConstantEmitter(CGF.CGM, &CGF) - .emitAbstract(SLE->getLocation(), Evaluated, SLE->getType()); + return ConstantEmitter(CGF).emitAbstract(SLE->getLocation(), Evaluated, + SLE->getType()); } Value *VisitCXXDefaultArgExpr(CXXDefaultArgExpr *DAE) { @@ -976,6 +976,11 @@ EmitIntegerTruncationCheckHelper(Value *Src, QualType SrcType, Value *Dst, return std::make_pair(Kind, std::make_pair(Check, Mask)); } +static bool PromotionIsPotentiallyEligibleForImplicitIntegerConversionCheck( + QualType SrcType, QualType DstType) { + return SrcType->isIntegerType() && DstType->isIntegerType(); +} + void ScalarExprEmitter::EmitIntegerTruncationCheck(Value *Src, QualType SrcType, Value *Dst, QualType DstType, SourceLocation Loc) { @@ -984,7 +989,8 @@ void ScalarExprEmitter::EmitIntegerTruncationCheck(Value *Src, QualType SrcType, // We only care about int->int conversions here. // We ignore conversions to/from pointer and/or bool. 
- if (!(SrcType->isIntegerType() && DstType->isIntegerType())) + if (!PromotionIsPotentiallyEligibleForImplicitIntegerConversionCheck(SrcType, + DstType)) return; unsigned SrcBits = Src->getType()->getScalarSizeInBits(); @@ -1095,7 +1101,8 @@ void ScalarExprEmitter::EmitIntegerSignChangeCheck(Value *Src, QualType SrcType, // We only care about int->int conversions here. // We ignore conversions to/from pointer and/or bool. - if (!(SrcType->isIntegerType() && DstType->isIntegerType())) + if (!PromotionIsPotentiallyEligibleForImplicitIntegerConversionCheck(SrcType, + DstType)) return; bool SrcSigned = SrcType->isSignedIntegerOrEnumerationType(); @@ -1972,7 +1979,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { case CK_LValueBitCast: case CK_ObjCObjectLValueCast: { - Address Addr = EmitLValue(E).getAddress(); + Address Addr = EmitLValue(E).getAddress(CGF); Addr = Builder.CreateElementBitCast(Addr, CGF.ConvertTypeForMem(DestTy)); LValue LV = CGF.MakeAddrLValue(Addr, DestTy); return EmitLoadOfLValue(LV, CE->getExprLoc()); @@ -1980,7 +1987,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { case CK_LValueToRValueBitCast: { LValue SourceLVal = CGF.EmitLValue(E); - Address Addr = Builder.CreateElementBitCast(SourceLVal.getAddress(), + Address Addr = Builder.CreateElementBitCast(SourceLVal.getAddress(CGF), CGF.ConvertTypeForMem(DestTy)); LValue DestLV = CGF.MakeAddrLValue(Addr, DestTy); DestLV.setTBAAInfo(TBAAAccessInfo::getMayAliasInfo()); @@ -2121,7 +2128,7 @@ Value *ScalarExprEmitter::VisitCastExpr(CastExpr *CE) { case CK_ArrayToPointerDecay: return CGF.EmitArrayToPointerDecay(E).getPointer(); case CK_FunctionToPointerDecay: - return EmitLValue(E).getPointer(); + return EmitLValue(E).getPointer(CGF); case CK_NullToPointer: if (MustVisitNullValue(E)) @@ -2386,14 +2393,14 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, if (isInc && type->isBooleanType()) { llvm::Value *True = CGF.EmitToMemory(Builder.getTrue(), type); if (isPre) { - Builder.CreateStore(True, LV.getAddress(), LV.isVolatileQualified()) - ->setAtomic(llvm::AtomicOrdering::SequentiallyConsistent); + Builder.CreateStore(True, LV.getAddress(CGF), LV.isVolatileQualified()) + ->setAtomic(llvm::AtomicOrdering::SequentiallyConsistent); return Builder.getTrue(); } // For atomic bool increment, we just store true and return it for // preincrement, do an atomic swap with true for postincrement return Builder.CreateAtomicRMW( - llvm::AtomicRMWInst::Xchg, LV.getPointer(), True, + llvm::AtomicRMWInst::Xchg, LV.getPointer(CGF), True, llvm::AtomicOrdering::SequentiallyConsistent); } // Special case for atomic increment / decrement on integers, emit @@ -2410,8 +2417,9 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, llvm::Instruction::Sub; llvm::Value *amt = CGF.EmitToMemory( llvm::ConstantInt::get(ConvertType(type), 1, true), type); - llvm::Value *old = Builder.CreateAtomicRMW(aop, - LV.getPointer(), amt, llvm::AtomicOrdering::SequentiallyConsistent); + llvm::Value *old = + Builder.CreateAtomicRMW(aop, LV.getPointer(CGF), amt, + llvm::AtomicOrdering::SequentiallyConsistent); return isPre ? Builder.CreateBinOp(op, old, amt) : old; } value = EmitLoadOfLValue(LV, E->getExprLoc()); @@ -2442,9 +2450,51 @@ ScalarExprEmitter::EmitScalarPrePostIncDec(const UnaryOperator *E, LValue LV, // Most common case by far: integer increment. 
   } else if (type->isIntegerType()) {
-    // Note that signed integer inc/dec with width less than int can't
-    // overflow because of promotion rules; we're just eliding a few steps here.
-    if (E->canOverflow() && type->isSignedIntegerOrEnumerationType()) {
+    QualType promotedType;
+    bool canPerformLossyDemotionCheck = false;
+    if (type->isPromotableIntegerType()) {
+      promotedType = CGF.getContext().getPromotedIntegerType(type);
+      assert(promotedType != type && "Shouldn't promote to the same type.");
+      canPerformLossyDemotionCheck = true;
+      canPerformLossyDemotionCheck &=
+          CGF.getContext().getCanonicalType(type) !=
+          CGF.getContext().getCanonicalType(promotedType);
+      canPerformLossyDemotionCheck &=
+          PromotionIsPotentiallyEligibleForImplicitIntegerConversionCheck(
+              type, promotedType);
+      assert((!canPerformLossyDemotionCheck ||
+              type->isSignedIntegerOrEnumerationType() ||
+              promotedType->isSignedIntegerOrEnumerationType() ||
+              ConvertType(type)->getScalarSizeInBits() ==
+                  ConvertType(promotedType)->getScalarSizeInBits()) &&
+             "The following check expects that if we do promotion to different "
+             "underlying canonical type, at least one of the types (either "
+             "base or promoted) will be signed, or the bitwidths will match.");
+    }
+    if (CGF.SanOpts.hasOneOf(
+            SanitizerKind::ImplicitIntegerArithmeticValueChange) &&
+        canPerformLossyDemotionCheck) {
+      // While `x += 1` (for `x` with width less than int) is modeled as
+      // promotion+arithmetics+demotion, and we can catch lossy demotion with
+      // ease; inc/dec with width less than int can't overflow because of
+      // promotion rules, so we omit promotion+demotion, which means that we can
+      // not catch lossy "demotion". Because we still want to catch these cases
+      // when the sanitizer is enabled, we perform the promotion, then perform
+      // the increment/decrement in the wider type, and finally
+      // perform the demotion. This will catch lossy demotions.
+
+      value = EmitScalarConversion(value, type, promotedType, E->getExprLoc());
+      Value *amt = llvm::ConstantInt::get(value->getType(), amount, true);
+      value = Builder.CreateAdd(value, amt, isInc ? "inc" : "dec");
+      // Do pass non-default ScalarConversionOpts so that sanitizer check is
+      // emitted.
+      value = EmitScalarConversion(value, promotedType, type, E->getExprLoc(),
+                                   ScalarConversionOpts(CGF.SanOpts));
+
+      // Note that signed integer inc/dec with width less than int can't
+      // overflow because of promotion rules; we're just eliding a few steps
+      // here.
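As the comment above explains, the hunk re-materializes the promote/operate/demote sequence for ++/-- on types narrower than int when the implicit-conversion sanitizer is active, so lossy demotions become observable. A standalone analogue of the emitted sequence (inc_with_check is a made-up name, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    // Mirrors the emitted sequence: widen to the promoted type, do the
    // arithmetic there, then check whether demoting back changes the value.
    static uint8_t inc_with_check(uint8_t c) {
      int promoted = static_cast<int>(c) + 1;           // promotion + add
      uint8_t demoted = static_cast<uint8_t>(promoted); // demotion back
      if (static_cast<int>(demoted) != promoted)        // the sanitizer's test
        std::fprintf(stderr, "lossy demotion: %d -> %d\n", promoted,
                     static_cast<int>(demoted));
      return demoted;
    }

    int main() {
      uint8_t c = 255;
      c = inc_with_check(c); // reports 256 -> 0, the case plain `c++` hides
      std::printf("%d\n", static_cast<int>(c));
      return 0;
    }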
+ } else if (E->canOverflow() && type->isSignedIntegerOrEnumerationType()) { value = EmitIncDecConsiderOverflowBehavior(E, value, isInc); } else if (E->canOverflow() && type->isUnsignedIntegerType() && CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow)) { @@ -2957,7 +3007,7 @@ LValue ScalarExprEmitter::EmitCompoundAssignLValue( E->getExprLoc()), LHSTy); Value *OldVal = Builder.CreateAtomicRMW( - AtomicOp, LHSLV.getPointer(), Amt, + AtomicOp, LHSLV.getPointer(CGF), Amt, llvm::AtomicOrdering::SequentiallyConsistent); // Since operation is atomic, the result type is guaranteed to be the @@ -4011,7 +4061,7 @@ Value *ScalarExprEmitter::VisitBinAssign(const BinaryOperator *E) { case Qualifiers::OCL_Weak: RHS = Visit(E->getRHS()); LHS = EmitCheckedLValue(E->getLHS(), CodeGenFunction::TCK_Store); - RHS = CGF.EmitARCStoreWeak(LHS.getAddress(), RHS, Ignore); + RHS = CGF.EmitARCStoreWeak(LHS.getAddress(CGF), RHS, Ignore); break; case Qualifiers::OCL_None: @@ -4588,7 +4638,7 @@ LValue CodeGenFunction::EmitObjCIsaExpr(const ObjCIsaExpr *E) { if (BaseExpr->isRValue()) { Addr = Address(EmitScalarExpr(BaseExpr), getPointerAlign()); } else { - Addr = EmitLValue(BaseExpr).getAddress(); + Addr = EmitLValue(BaseExpr).getAddress(*this); } // Cast the address to Class*. diff --git a/clang/lib/CodeGen/CGNonTrivialStruct.cpp b/clang/lib/CodeGen/CGNonTrivialStruct.cpp index 332e51e57ded0..d5f378c522322 100644 --- a/clang/lib/CodeGen/CGNonTrivialStruct.cpp +++ b/clang/lib/CodeGen/CGNonTrivialStruct.cpp @@ -707,7 +707,7 @@ struct GenMoveConstructor : GenBinaryFunc { LValue SrcLV = CGF->MakeAddrLValue(Addrs[SrcIdx], QT); llvm::Value *SrcVal = CGF->EmitLoadOfLValue(SrcLV, SourceLocation()).getScalarVal(); - CGF->EmitStoreOfScalar(getNullForVariable(SrcLV.getAddress()), SrcLV); + CGF->EmitStoreOfScalar(getNullForVariable(SrcLV.getAddress(*CGF)), SrcLV); CGF->EmitStoreOfScalar(SrcVal, CGF->MakeAddrLValue(Addrs[DstIdx], QT), /* isInitialization */ true); } @@ -770,7 +770,7 @@ struct GenMoveAssignment : GenBinaryFunc { LValue SrcLV = CGF->MakeAddrLValue(Addrs[SrcIdx], QT); llvm::Value *SrcVal = CGF->EmitLoadOfLValue(SrcLV, SourceLocation()).getScalarVal(); - CGF->EmitStoreOfScalar(getNullForVariable(SrcLV.getAddress()), SrcLV); + CGF->EmitStoreOfScalar(getNullForVariable(SrcLV.getAddress(*CGF)), SrcLV); LValue DstLV = CGF->MakeAddrLValue(Addrs[DstIdx], QT); llvm::Value *DstVal = CGF->EmitLoadOfLValue(DstLV, SourceLocation()).getScalarVal(); @@ -806,7 +806,8 @@ void CodeGenFunction::destroyNonTrivialCStruct(CodeGenFunction &CGF, // such structure. void CodeGenFunction::defaultInitNonTrivialCStructVar(LValue Dst) { GenDefaultInitialize Gen(getContext()); - Address DstPtr = Builder.CreateBitCast(Dst.getAddress(), CGM.Int8PtrPtrTy); + Address DstPtr = + Builder.CreateBitCast(Dst.getAddress(*this), CGM.Int8PtrPtrTy); Gen.setCGF(this); QualType QT = Dst.getType(); QT = Dst.isVolatile() ? QT.withVolatile() : QT; @@ -850,7 +851,7 @@ getSpecialFunction(G &&Gen, StringRef FuncName, QualType QT, bool IsVolatile, // Functions to emit calls to the special functions of a non-trivial C struct. 
void CodeGenFunction::callCStructDefaultConstructor(LValue Dst) { bool IsVolatile = Dst.isVolatile(); - Address DstPtr = Dst.getAddress(); + Address DstPtr = Dst.getAddress(*this); QualType QT = Dst.getType(); GenDefaultInitializeFuncName GenName(DstPtr.getAlignment(), getContext()); std::string FuncName = GenName.getName(QT, IsVolatile); @@ -874,7 +875,7 @@ std::string CodeGenFunction::getNonTrivialDestructorStr(QualType QT, void CodeGenFunction::callCStructDestructor(LValue Dst) { bool IsVolatile = Dst.isVolatile(); - Address DstPtr = Dst.getAddress(); + Address DstPtr = Dst.getAddress(*this); QualType QT = Dst.getType(); GenDestructorFuncName GenName("__destructor_", DstPtr.getAlignment(), getContext()); @@ -885,7 +886,7 @@ void CodeGenFunction::callCStructDestructor(LValue Dst) { void CodeGenFunction::callCStructCopyConstructor(LValue Dst, LValue Src) { bool IsVolatile = Dst.isVolatile() || Src.isVolatile(); - Address DstPtr = Dst.getAddress(), SrcPtr = Src.getAddress(); + Address DstPtr = Dst.getAddress(*this), SrcPtr = Src.getAddress(*this); QualType QT = Dst.getType(); GenBinaryFuncName GenName("__copy_constructor_", DstPtr.getAlignment(), SrcPtr.getAlignment(), getContext()); @@ -899,7 +900,7 @@ void CodeGenFunction::callCStructCopyAssignmentOperator(LValue Dst, LValue Src ) { bool IsVolatile = Dst.isVolatile() || Src.isVolatile(); - Address DstPtr = Dst.getAddress(), SrcPtr = Src.getAddress(); + Address DstPtr = Dst.getAddress(*this), SrcPtr = Src.getAddress(*this); QualType QT = Dst.getType(); GenBinaryFuncName GenName("__copy_assignment_", DstPtr.getAlignment(), SrcPtr.getAlignment(), getContext()); @@ -910,7 +911,7 @@ void CodeGenFunction::callCStructCopyAssignmentOperator(LValue Dst, LValue Src void CodeGenFunction::callCStructMoveConstructor(LValue Dst, LValue Src) { bool IsVolatile = Dst.isVolatile() || Src.isVolatile(); - Address DstPtr = Dst.getAddress(), SrcPtr = Src.getAddress(); + Address DstPtr = Dst.getAddress(*this), SrcPtr = Src.getAddress(*this); QualType QT = Dst.getType(); GenBinaryFuncName GenName("__move_constructor_", DstPtr.getAlignment(), SrcPtr.getAlignment(), getContext()); @@ -924,7 +925,7 @@ void CodeGenFunction::callCStructMoveAssignmentOperator(LValue Dst, LValue Src ) { bool IsVolatile = Dst.isVolatile() || Src.isVolatile(); - Address DstPtr = Dst.getAddress(), SrcPtr = Src.getAddress(); + Address DstPtr = Dst.getAddress(*this), SrcPtr = Src.getAddress(*this); QualType QT = Dst.getType(); GenBinaryFuncName GenName("__move_assignment_", DstPtr.getAlignment(), SrcPtr.getAlignment(), getContext()); diff --git a/clang/lib/CodeGen/CGObjC.cpp b/clang/lib/CodeGen/CGObjC.cpp index 984fa599a99f3..14391f3b129a1 100644 --- a/clang/lib/CodeGen/CGObjC.cpp +++ b/clang/lib/CodeGen/CGObjC.cpp @@ -511,7 +511,7 @@ RValue CodeGenFunction::EmitObjCMessageExpr(const ObjCMessageExpr *E, method->getMethodFamily() == OMF_retain) { if (auto lvalueExpr = findWeakLValue(E->getInstanceReceiver())) { LValue lvalue = EmitLValue(lvalueExpr); - llvm::Value *result = EmitARCLoadWeakRetained(lvalue.getAddress()); + llvm::Value *result = EmitARCLoadWeakRetained(lvalue.getAddress(*this)); return AdjustObjCObjectType(*this, E->getType(), RValue::get(result)); } } @@ -749,8 +749,8 @@ static void emitStructGetterCall(CodeGenFunction &CGF, ObjCIvarDecl *ivar, ASTContext &Context = CGF.getContext(); Address src = - CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), CGF.LoadObjCSelf(), ivar, 0) - .getAddress(); + CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), CGF.LoadObjCSelf(), ivar, 0) + 
.getAddress(CGF); // objc_copyStruct (ReturnValue, &structIvar, // sizeof (Type of Ivar), isAtomic, false); @@ -1022,8 +1022,8 @@ static void emitCPPObjectAtomicGetterCall(CodeGenFunction &CGF, // The 2nd argument is the address of the ivar. llvm::Value *ivarAddr = - CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), - CGF.LoadObjCSelf(), ivar, 0).getPointer(); + CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), CGF.LoadObjCSelf(), ivar, 0) + .getPointer(CGF); ivarAddr = CGF.Builder.CreateBitCast(ivarAddr, CGF.Int8PtrTy); args.add(RValue::get(ivarAddr), CGF.getContext().VoidPtrTy); @@ -1082,7 +1082,7 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, bitcastType = bitcastType->getPointerTo(); // addrspace 0 okay // Perform an atomic load. This does not impose ordering constraints. - Address ivarAddr = LV.getAddress(); + Address ivarAddr = LV.getAddress(*this); ivarAddr = Builder.CreateBitCast(ivarAddr, bitcastType); llvm::LoadInst *load = Builder.CreateLoad(ivarAddr, "load"); load->setAtomic(llvm::AtomicOrdering::Unordered); @@ -1183,14 +1183,14 @@ CodeGenFunction::generateObjCGetterBody(const ObjCImplementationDecl *classImpl, case TEK_Scalar: { llvm::Value *value; if (propType->isReferenceType()) { - value = LV.getAddress().getPointer(); + value = LV.getAddress(*this).getPointer(); } else { // We want to load and autoreleaseReturnValue ARC __weak ivars. if (LV.getQuals().getObjCLifetime() == Qualifiers::OCL_Weak) { if (getLangOpts().ObjCAutoRefCount) { value = emitARCRetainLoadOfScalar(*this, LV, ivarType); } else { - value = EmitARCLoadWeak(LV.getAddress()); + value = EmitARCLoadWeak(LV.getAddress(*this)); } // Otherwise we want to do a simple load, suppressing the @@ -1224,9 +1224,9 @@ static void emitStructSetterCall(CodeGenFunction &CGF, ObjCMethodDecl *OMD, CallArgList args; // The first argument is the address of the ivar. - llvm::Value *ivarAddr = CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), - CGF.LoadObjCSelf(), ivar, 0) - .getPointer(); + llvm::Value *ivarAddr = + CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), CGF.LoadObjCSelf(), ivar, 0) + .getPointer(CGF); ivarAddr = CGF.Builder.CreateBitCast(ivarAddr, CGF.Int8PtrTy); args.add(RValue::get(ivarAddr), CGF.getContext().VoidPtrTy); @@ -1235,7 +1235,7 @@ static void emitStructSetterCall(CodeGenFunction &CGF, ObjCMethodDecl *OMD, DeclRefExpr argRef(CGF.getContext(), argVar, false, argVar->getType().getNonReferenceType(), VK_LValue, SourceLocation()); - llvm::Value *argAddr = CGF.EmitLValue(&argRef).getPointer(); + llvm::Value *argAddr = CGF.EmitLValue(&argRef).getPointer(CGF); argAddr = CGF.Builder.CreateBitCast(argAddr, CGF.Int8PtrTy); args.add(RValue::get(argAddr), CGF.getContext().VoidPtrTy); @@ -1271,8 +1271,8 @@ static void emitCPPObjectAtomicSetterCall(CodeGenFunction &CGF, // The first argument is the address of the ivar. 
llvm::Value *ivarAddr = - CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), - CGF.LoadObjCSelf(), ivar, 0).getPointer(); + CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), CGF.LoadObjCSelf(), ivar, 0) + .getPointer(CGF); ivarAddr = CGF.Builder.CreateBitCast(ivarAddr, CGF.Int8PtrTy); args.add(RValue::get(ivarAddr), CGF.getContext().VoidPtrTy); @@ -1281,7 +1281,7 @@ static void emitCPPObjectAtomicSetterCall(CodeGenFunction &CGF, DeclRefExpr argRef(CGF.getContext(), argVar, false, argVar->getType().getNonReferenceType(), VK_LValue, SourceLocation()); - llvm::Value *argAddr = CGF.EmitLValue(&argRef).getPointer(); + llvm::Value *argAddr = CGF.EmitLValue(&argRef).getPointer(CGF); argAddr = CGF.Builder.CreateBitCast(argAddr, CGF.Int8PtrTy); args.add(RValue::get(argAddr), CGF.getContext().VoidPtrTy); @@ -1358,7 +1358,7 @@ CodeGenFunction::generateObjCSetterBody(const ObjCImplementationDecl *classImpl, LValue ivarLValue = EmitLValueForIvar(TypeOfSelfObject(), LoadObjCSelf(), ivar, /*quals*/ 0); - Address ivarAddr = ivarLValue.getAddress(); + Address ivarAddr = ivarLValue.getAddress(*this); // Currently, all atomic accesses have to be through integer // types, so there's no point in trying to pick a prettier type. @@ -1535,7 +1535,7 @@ namespace { void Emit(CodeGenFunction &CGF, Flags flags) override { LValue lvalue = CGF.EmitLValueForIvar(CGF.TypeOfSelfObject(), addr, ivar, /*CVR*/ 0); - CGF.emitDestroy(lvalue.getAddress(), ivar->getType(), destroyer, + CGF.emitDestroy(lvalue.getAddress(CGF), ivar->getType(), destroyer, flags.isForNormalCleanup() && useEHCleanupForArray); } }; @@ -1602,7 +1602,7 @@ void CodeGenFunction::GenerateObjCCtorDtorMethod(ObjCImplementationDecl *IMP, LValue LV = EmitLValueForIvar(TypeOfSelfObject(), LoadObjCSelf(), Ivar, 0); EmitAggExpr(IvarInit->getInit(), - AggValueSlot::forLValue(LV, AggValueSlot::IsDestructed, + AggValueSlot::forLValue(LV, *this, AggValueSlot::IsDestructed, AggValueSlot::DoesNotNeedGCBarriers, AggValueSlot::IsNotAliased, AggValueSlot::DoesNotOverlap)); @@ -2327,7 +2327,7 @@ llvm::Value *CodeGenFunction::EmitARCStoreStrong(LValue dst, !isBlock && (dst.getAlignment().isZero() || dst.getAlignment() >= CharUnits::fromQuantity(PointerAlignInBytes))) { - return EmitARCStoreStrongCall(dst.getAddress(), newValue, ignored); + return EmitARCStoreStrongCall(dst.getAddress(*this), newValue, ignored); } // Otherwise, split it out. @@ -2726,7 +2726,7 @@ static TryEmitResult tryEmitARCRetainLoadOfScalar(CodeGenFunction &CGF, result = CGF.EmitLoadOfLValue(lvalue, SourceLocation()).getScalarVal(); } else { assert(type.getObjCLifetime() == Qualifiers::OCL_Weak); - result = CGF.EmitARCLoadWeakRetained(lvalue.getAddress()); + result = CGF.EmitARCLoadWeakRetained(lvalue.getAddress(CGF)); } return TryEmitResult(result, !shouldRetain); } @@ -2750,7 +2750,7 @@ static TryEmitResult tryEmitARCRetainLoadOfScalar(CodeGenFunction &CGF, SourceLocation()).getScalarVal(); // Set the source pointer to NULL. 
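The ObjC move helpers touched in this area all follow one shape: load the scalar out of the source lvalue, then store null back through the same lvalue (the getNullForVariable stores). A generic, compilable analogue of that read-then-clear step, shown only to make the pattern concrete:

    #include <cassert>

    // Read-then-clear: the shape of "EmitLoadOfLValue, then
    // EmitStoreOfScalar(getNullForVariable(...))" in the move emitters.
    template <typename T>
    T *takeAndClear(T *&slot) {
      T *value = slot; // load the current pointer
      slot = nullptr;  // null out the source so ownership moves, not copies
      return value;
    }

    int main() {
      int x = 42;
      int *p = &x;
      int *moved = takeAndClear(p);
      assert(moved == &x && p == nullptr);
      return 0;
    }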
- CGF.EmitStoreOfScalar(getNullForVariable(lv.getAddress()), lv); + CGF.EmitStoreOfScalar(getNullForVariable(lv.getAddress(CGF)), lv); return TryEmitResult(result, true); } diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 5196c4d8503e7..f6edf899b5492 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -357,7 +357,7 @@ class CGOpenMPInnerExprInfo final : public CGOpenMPInlinedRegionInfo { VD->getType().getNonReferenceType(), VK_LValue, C.getLocation()); PrivScope.addPrivate( - VD, [&CGF, &DRE]() { return CGF.EmitLValue(&DRE).getAddress(); }); + VD, [&CGF, &DRE]() { return CGF.EmitLValue(&DRE).getAddress(CGF); }); } (void)PrivScope.Privatize(); } @@ -842,7 +842,7 @@ static void emitInitWithReductionInitializer(CodeGenFunction &CGF, RValue::getComplex(CGF.EmitLoadOfComplex(LV, DRD->getLocation())); break; case TEK_Aggregate: - InitRVal = RValue::getAggregate(LV.getAddress()); + InitRVal = RValue::getAggregate(LV.getAddress(CGF)); break; } OpaqueValueExpr OVE(DRD->getLocation(), Ty, VK_RValue); @@ -966,7 +966,7 @@ void ReductionCodeGen::emitAggregateInitialization( EmitDeclareReductionInit, EmitDeclareReductionInit ? ClausesData[N].ReductionOp : PrivateVD->getInit(), - DRD, SharedLVal.getAddress()); + DRD, SharedLVal.getAddress(CGF)); } ReductionCodeGen::ReductionCodeGen(ArrayRef Shareds, @@ -1007,13 +1007,13 @@ void ReductionCodeGen::emitAggregateType(CodeGenFunction &CGF, unsigned N) { } llvm::Value *Size; llvm::Value *SizeInChars; - auto *ElemType = - cast(SharedAddresses[N].first.getPointer()->getType()) - ->getElementType(); + auto *ElemType = cast( + SharedAddresses[N].first.getPointer(CGF)->getType()) + ->getElementType(); auto *ElemSizeOf = llvm::ConstantExpr::getSizeOf(ElemType); if (AsArraySection) { - Size = CGF.Builder.CreatePtrDiff(SharedAddresses[N].second.getPointer(), - SharedAddresses[N].first.getPointer()); + Size = CGF.Builder.CreatePtrDiff(SharedAddresses[N].second.getPointer(CGF), + SharedAddresses[N].first.getPointer(CGF)); Size = CGF.Builder.CreateNUWAdd( Size, llvm::ConstantInt::get(Size->getType(), /*V=*/1)); SizeInChars = CGF.Builder.CreateNUWMul(Size, ElemSizeOf); @@ -1063,7 +1063,7 @@ void ReductionCodeGen::emitInitialization( PrivateAddr, CGF.ConvertTypeForMem(PrivateType)); QualType SharedType = SharedAddresses[N].first.getType(); SharedLVal = CGF.MakeAddrLValue( - CGF.Builder.CreateElementBitCast(SharedLVal.getAddress(), + CGF.Builder.CreateElementBitCast(SharedLVal.getAddress(CGF), CGF.ConvertTypeForMem(SharedType)), SharedType, SharedAddresses[N].first.getBaseInfo(), CGF.CGM.getTBAAInfoForSubobject(SharedAddresses[N].first, SharedType)); @@ -1071,7 +1071,7 @@ void ReductionCodeGen::emitInitialization( emitAggregateInitialization(CGF, N, PrivateAddr, SharedLVal, DRD); } else if (DRD && (DRD->getInitializer() || !PrivateVD->hasInit())) { emitInitWithReductionInitializer(CGF, DRD, ClausesData[N].ReductionOp, - PrivateAddr, SharedLVal.getAddress(), + PrivateAddr, SharedLVal.getAddress(CGF), SharedLVal.getType()); } else if (!DefaultInit(CGF) && PrivateVD->hasInit() && !CGF.isTrivialInitializer(PrivateVD->getInit())) { @@ -1108,15 +1108,15 @@ static LValue loadToBegin(CodeGenFunction &CGF, QualType BaseTy, QualType ElTy, while ((BaseTy->isPointerType() || BaseTy->isReferenceType()) && !CGF.getContext().hasSameType(BaseTy, ElTy)) { if (const auto *PtrTy = BaseTy->getAs()) { - BaseLV = CGF.EmitLoadOfPointerLValue(BaseLV.getAddress(), PtrTy); + BaseLV = 
CGF.EmitLoadOfPointerLValue(BaseLV.getAddress(CGF), PtrTy); } else { - LValue RefLVal = CGF.MakeAddrLValue(BaseLV.getAddress(), BaseTy); + LValue RefLVal = CGF.MakeAddrLValue(BaseLV.getAddress(CGF), BaseTy); BaseLV = CGF.EmitLoadOfReferenceLValue(RefLVal); } BaseTy = BaseTy->getPointeeType(); } return CGF.MakeAddrLValue( - CGF.Builder.CreateElementBitCast(BaseLV.getAddress(), + CGF.Builder.CreateElementBitCast(BaseLV.getAddress(CGF), CGF.ConvertTypeForMem(ElTy)), BaseLV.getType(), BaseLV.getBaseInfo(), CGF.CGM.getTBAAInfoForSubobject(BaseLV, BaseLV.getType())); @@ -1180,15 +1180,15 @@ Address ReductionCodeGen::adjustPrivateAddress(CodeGenFunction &CGF, unsigned N, loadToBegin(CGF, OrigVD->getType(), SharedAddresses[N].first.getType(), OriginalBaseLValue); llvm::Value *Adjustment = CGF.Builder.CreatePtrDiff( - BaseLValue.getPointer(), SharedAddresses[N].first.getPointer()); + BaseLValue.getPointer(CGF), SharedAddresses[N].first.getPointer(CGF)); llvm::Value *PrivatePointer = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( PrivateAddr.getPointer(), - SharedAddresses[N].first.getAddress().getType()); + SharedAddresses[N].first.getAddress(CGF).getType()); llvm::Value *Ptr = CGF.Builder.CreateGEP(PrivatePointer, Adjustment); return castToBase(CGF, OrigVD->getType(), SharedAddresses[N].first.getType(), - OriginalBaseLValue.getAddress().getType(), + OriginalBaseLValue.getAddress(CGF).getType(), OriginalBaseLValue.getAlignment(), Ptr); } BaseDecls.emplace_back( @@ -1381,12 +1381,12 @@ emitCombinerOrInitializer(CodeGenModule &CGM, QualType Ty, Address AddrIn = CGF.GetAddrOfLocalVar(&OmpInParm); Scope.addPrivate(In, [&CGF, AddrIn, PtrTy]() { return CGF.EmitLoadOfPointerLValue(AddrIn, PtrTy->castAs()) - .getAddress(); + .getAddress(CGF); }); Address AddrOut = CGF.GetAddrOfLocalVar(&OmpOutParm); Scope.addPrivate(Out, [&CGF, AddrOut, PtrTy]() { return CGF.EmitLoadOfPointerLValue(AddrOut, PtrTy->castAs()) - .getAddress(); + .getAddress(CGF); }); (void)Scope.Privatize(); if (!IsCombiner && Out->hasInit() && @@ -1496,7 +1496,7 @@ llvm::Function *CGOpenMPRuntime::emitTaskOutlinedFunction( UpLoc, ThreadID, CGF.EmitLoadOfPointerLValue(CGF.GetAddrOfLocalVar(TaskTVar), TaskTVar->getType()->castAs()) - .getPointer()}; + .getPointer(CGF)}; CGF.EmitRuntimeCall(createRuntimeFunction(OMPRTL__kmpc_omp_task), TaskArgs); }; CGOpenMPTaskOutlinedRegionInfo::UntiedTaskActionTy Action(Tied, PartIDVar, @@ -1707,9 +1707,10 @@ llvm::Value *CGOpenMPRuntime::getThreadID(CodeGenFunction &CGF, if (!CGF.EHStack.requiresLandingPad() || !CGF.getLangOpts().Exceptions || !CGF.getLangOpts().CXXExceptions || CGF.Builder.GetInsertBlock() == TopBlock || - !isa(LVal.getPointer()) || - cast(LVal.getPointer())->getParent() == TopBlock || - cast(LVal.getPointer())->getParent() == + !isa(LVal.getPointer(CGF)) || + cast(LVal.getPointer(CGF))->getParent() == + TopBlock || + cast(LVal.getPointer(CGF))->getParent() == CGF.Builder.GetInsertBlock()) { ThreadID = CGF.EmitLoadOfScalar(LVal, Loc); // If value loaded in entry block, cache it and use it everywhere in @@ -3119,7 +3120,7 @@ Address CGOpenMPRuntime::emitThreadIDAddress(CodeGenFunction &CGF, if (auto *OMPRegionInfo = dyn_cast_or_null(CGF.CapturedStmtInfo)) if (OMPRegionInfo->getThreadIDVariable()) - return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress(); + return OMPRegionInfo->getThreadIDVariableLValue(CGF).getAddress(CGF); llvm::Value *ThreadID = getThreadID(CGF, Loc); QualType Int32Ty = @@ -3395,7 +3396,8 @@ void CGOpenMPRuntime::emitSingleRegion(CodeGenFunction &CGF, Address 
Elem = CGF.Builder.CreateConstArrayGEP(CopyprivateList, I); CGF.Builder.CreateStore( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(CopyprivateVars[I]).getPointer(), CGF.VoidPtrTy), + CGF.EmitLValue(CopyprivateVars[I]).getPointer(CGF), + CGF.VoidPtrTy), Elem); } // Build function that copies private values from single region to all other @@ -4540,7 +4542,7 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, const auto *KmpTaskTQTyRD = cast(KmpTaskTQTy->getAsTagDecl()); auto PartIdFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTPartId); LValue PartIdLVal = CGF.EmitLValueForField(Base, *PartIdFI); - llvm::Value *PartidParam = PartIdLVal.getPointer(); + llvm::Value *PartidParam = PartIdLVal.getPointer(CGF); auto SharedsFI = std::next(KmpTaskTQTyRD->field_begin(), KmpTaskTShareds); LValue SharedsLVal = CGF.EmitLValueForField(Base, *SharedsFI); @@ -4553,7 +4555,7 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, if (PrivatesFI != KmpTaskTWithPrivatesQTyRD->field_end()) { LValue PrivatesLVal = CGF.EmitLValueForField(TDBase, *PrivatesFI); PrivatesParam = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - PrivatesLVal.getPointer(), CGF.VoidPtrTy); + PrivatesLVal.getPointer(CGF), CGF.VoidPtrTy); } else { PrivatesParam = llvm::ConstantPointerNull::get(CGF.VoidPtrTy); } @@ -4562,7 +4564,7 @@ emitProxyTaskFunction(CodeGenModule &CGM, SourceLocation Loc, TaskPrivatesMap, CGF.Builder .CreatePointerBitCastOrAddrSpaceCast( - TDBase.getAddress(), CGF.VoidPtrTy) + TDBase.getAddress(CGF), CGF.VoidPtrTy) .getPointer()}; SmallVector CallArgs(std::begin(CommonArgs), std::end(CommonArgs)); @@ -4640,7 +4642,7 @@ static llvm::Value *emitDestructorsFunction(CodeGenModule &CGM, if (QualType::DestructionKind DtorKind = Field->getType().isDestructedType()) { LValue FieldLValue = CGF.EmitLValueForField(Base, Field); - CGF.pushDestroy(DtorKind, FieldLValue.getAddress(), Field->getType()); + CGF.pushDestroy(DtorKind, FieldLValue.getAddress(CGF), Field->getType()); } } CGF.FinishFunction(); @@ -4738,8 +4740,8 @@ emitTaskPrivateMappingFunction(CodeGenModule &CGM, SourceLocation Loc, LValue RefLVal = CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); LValue RefLoadLVal = CGF.EmitLoadOfPointerLValue( - RefLVal.getAddress(), RefLVal.getType()->castAs()); - CGF.EmitStoreOfScalar(FieldLVal.getPointer(), RefLoadLVal); + RefLVal.getAddress(CGF), RefLVal.getType()->castAs()); + CGF.EmitStoreOfScalar(FieldLVal.getPointer(CGF), RefLoadLVal); ++Counter; } CGF.FinishFunction(); @@ -4804,7 +4806,8 @@ static void emitPrivatesInit(CodeGenFunction &CGF, } else { SharedRefLValue = CGF.EmitLValueForField(SrcBase, SharedField); SharedRefLValue = CGF.MakeAddrLValue( - Address(SharedRefLValue.getPointer(), C.getDeclAlign(OriginalVD)), + Address(SharedRefLValue.getPointer(CGF), + C.getDeclAlign(OriginalVD)), SharedRefLValue.getType(), LValueBaseInfo(AlignmentSource::Decl), SharedRefLValue.getTBAAInfo()); } @@ -4817,7 +4820,8 @@ static void emitPrivatesInit(CodeGenFunction &CGF, // Initialize firstprivate array using element-by-element // initialization. CGF.EmitOMPAggregateAssign( - PrivateLValue.getAddress(), SharedRefLValue.getAddress(), Type, + PrivateLValue.getAddress(CGF), SharedRefLValue.getAddress(CGF), + Type, [&CGF, Elem, Init, &CapturesInfo](Address DestElement, Address SrcElement) { // Clean up any temporaries needed by the initialization. 
@@ -4835,8 +4839,8 @@ static void emitPrivatesInit(CodeGenFunction &CGF, } } else { CodeGenFunction::OMPPrivateScope InitScope(CGF); - InitScope.addPrivate(Elem, [SharedRefLValue]() -> Address { - return SharedRefLValue.getAddress(); + InitScope.addPrivate(Elem, [SharedRefLValue, &CGF]() -> Address { + return SharedRefLValue.getAddress(CGF); }); (void)InitScope.Privatize(); CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CapturesInfo); @@ -5236,10 +5240,10 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc, dyn_cast(E->IgnoreParenImpCasts())) { LValue UpAddrLVal = CGF.EmitOMPArraySectionExpr(ASE, /*IsLowerBound=*/false); - llvm::Value *UpAddr = - CGF.Builder.CreateConstGEP1_32(UpAddrLVal.getPointer(), /*Idx0=*/1); + llvm::Value *UpAddr = CGF.Builder.CreateConstGEP1_32( + UpAddrLVal.getPointer(CGF), /*Idx0=*/1); llvm::Value *LowIntPtr = - CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGM.SizeTy); + CGF.Builder.CreatePtrToInt(Addr.getPointer(CGF), CGM.SizeTy); llvm::Value *UpIntPtr = CGF.Builder.CreatePtrToInt(UpAddr, CGM.SizeTy); Size = CGF.Builder.CreateNUWSub(UpIntPtr, LowIntPtr); } else { @@ -5252,7 +5256,7 @@ void CGOpenMPRuntime::emitTaskCall(CodeGenFunction &CGF, SourceLocation Loc, LValue BaseAddrLVal = CGF.EmitLValueForField( Base, *std::next(KmpDependInfoRD->field_begin(), BaseAddr)); CGF.EmitStoreOfScalar( - CGF.Builder.CreatePtrToInt(Addr.getPointer(), CGF.IntPtrTy), + CGF.Builder.CreatePtrToInt(Addr.getPointer(CGF), CGF.IntPtrTy), BaseAddrLVal); // deps[i].len = sizeof(); LValue LenLVal = CGF.EmitLValueForField( @@ -5406,21 +5410,24 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTLowerBound)); const auto *LBVar = cast(cast(D.getLowerBoundVariable())->getDecl()); - CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(), LBLVal.getQuals(), + CGF.EmitAnyExprToMem(LBVar->getInit(), LBLVal.getAddress(CGF), + LBLVal.getQuals(), /*IsInitializer=*/true); LValue UBLVal = CGF.EmitLValueForField( Result.TDBase, *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTUpperBound)); const auto *UBVar = cast(cast(D.getUpperBoundVariable())->getDecl()); - CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(), UBLVal.getQuals(), + CGF.EmitAnyExprToMem(UBVar->getInit(), UBLVal.getAddress(CGF), + UBLVal.getQuals(), /*IsInitializer=*/true); LValue StLVal = CGF.EmitLValueForField( Result.TDBase, *std::next(Result.KmpTaskTQTyRD->field_begin(), KmpTaskTStride)); const auto *StVar = cast(cast(D.getStrideVariable())->getDecl()); - CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(), StLVal.getQuals(), + CGF.EmitAnyExprToMem(StVar->getInit(), StLVal.getAddress(CGF), + StLVal.getQuals(), /*IsInitializer=*/true); // Store reductions address. 
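For orientation: the lower-bound/upper-bound/stride fields initialized in the taskloop hunks below feed the taskloop entry point of the LLVM OpenMP runtime, and lb/ub are passed by address, which is why the code stores LBLVal.getPointer(CGF) and UBLVal.getPointer(CGF). The declaration below is a sketch with stand-in typedefs so it is self-contained; treat the exact parameter types as an assumption and consult the runtime's kmp.h for the authoritative signature:

    // Stand-in typedefs; the real definitions live in the OpenMP runtime.
    typedef struct ident ident_t;
    typedef struct kmp_task kmp_task_t;
    typedef int kmp_int32;
    typedef long long kmp_int64;
    typedef unsigned long long kmp_uint64;

    // Approximate shape of the entry point emitTaskLoopCall targets.
    extern "C" void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid,
                                    kmp_task_t *task, kmp_int32 if_val,
                                    kmp_uint64 *lb, kmp_uint64 *ub,
                                    kmp_int64 st, kmp_int32 nogroup,
                                    kmp_int32 sched, kmp_uint64 grainsize,
                                    void *task_dup);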
LValue RedLVal = CGF.EmitLValueForField( @@ -5429,7 +5436,7 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, if (Data.Reductions) { CGF.EmitStoreOfScalar(Data.Reductions, RedLVal); } else { - CGF.EmitNullInitialization(RedLVal.getAddress(), + CGF.EmitNullInitialization(RedLVal.getAddress(CGF), CGF.getContext().VoidPtrTy); } enum { NoSchedule = 0, Grainsize = 1, NumTasks = 2 }; @@ -5438,11 +5445,11 @@ void CGOpenMPRuntime::emitTaskLoopCall(CodeGenFunction &CGF, SourceLocation Loc, ThreadID, Result.NewTask, IfVal, - LBLVal.getPointer(), - UBLVal.getPointer(), + LBLVal.getPointer(CGF), + UBLVal.getPointer(CGF), CGF.EmitLoadOfScalar(StLVal, Loc), llvm::ConstantInt::getSigned( - CGF.IntTy, 1), // Always 1 because taskgroup emitted by the compiler + CGF.IntTy, 1), // Always 1 because taskgroup emitted by the compiler llvm::ConstantInt::getSigned( CGF.IntTy, Data.Schedule.getPointer() ? Data.Schedule.getInt() ? NumTasks : Grainsize @@ -5754,7 +5761,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); CGF.Builder.CreateStore( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), + CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), Elem); if ((*IPriv)->getType()->isVariablyModifiedType()) { // Store array size. @@ -6234,7 +6241,7 @@ llvm::Value *CGOpenMPRuntime::emitTaskReductionInit( LValue SharedLVal = CGF.EmitLValueForField(ElemLVal, SharedFD); RCG.emitSharedLValue(CGF, Cnt); llvm::Value *CastedShared = - CGF.EmitCastToVoidPtr(RCG.getSharedLValue(Cnt).getPointer()); + CGF.EmitCastToVoidPtr(RCG.getSharedLValue(Cnt).getPointer(CGF)); CGF.EmitStoreOfScalar(CastedShared, SharedLVal); RCG.emitAggregateType(CGF, Cnt); llvm::Value *SizeValInChars; @@ -6277,7 +6284,8 @@ llvm::Value *CGOpenMPRuntime::emitTaskReductionInit( llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1, /*isSigned=*/true), FlagsLVal); } else - CGF.EmitNullInitialization(FlagsLVal.getAddress(), FlagsLVal.getType()); + CGF.EmitNullInitialization(FlagsLVal.getAddress(CGF), + FlagsLVal.getType()); } // Build call void *__kmpc_task_reduction_init(int gtid, int num_data, void // *data); @@ -6313,7 +6321,7 @@ void CGOpenMPRuntime::emitTaskReductionFixups(CodeGenFunction &CGF, generateUniqueName(CGM, "reduction", RCG.getRefExpr(N))); CGF.Builder.CreateStore( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - RCG.getSharedLValue(N).getPointer(), CGM.VoidPtrTy), + RCG.getSharedLValue(N).getPointer(CGF), CGM.VoidPtrTy), SharedAddr, /*IsVolatile=*/false); } } @@ -6324,12 +6332,12 @@ Address CGOpenMPRuntime::getTaskReductionItem(CodeGenFunction &CGF, LValue SharedLVal) { // Build call void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void // *d); - llvm::Value *Args[] = { - CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), CGM.IntTy, - /*isSigned=*/true), - ReductionsPtr, - CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(SharedLVal.getPointer(), - CGM.VoidPtrTy)}; + llvm::Value *Args[] = {CGF.Builder.CreateIntCast(getThreadID(CGF, Loc), + CGM.IntTy, + /*isSigned=*/true), + ReductionsPtr, + CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( + SharedLVal.getPointer(CGF), CGM.VoidPtrTy)}; return Address( CGF.EmitRuntimeCall( createRuntimeFunction(OMPRTL__kmpc_task_reduction_get_th_data), Args), @@ -7514,11 +7522,11 @@ class MappableExprsHandler { } else if ((AE && isa(AE->getBase()->IgnoreParenImpCasts())) || (OASE && 
isa(OASE->getBase()->IgnoreParenImpCasts()))) { - BP = CGF.EmitOMPSharedLValue(AssocExpr).getAddress(); + BP = CGF.EmitOMPSharedLValue(AssocExpr).getAddress(CGF); } else { // The base is the reference to the variable. // BP = &Var. - BP = CGF.EmitOMPSharedLValue(AssocExpr).getAddress(); + BP = CGF.EmitOMPSharedLValue(AssocExpr).getAddress(CGF); if (const auto *VD = dyn_cast_or_null(I->getAssociatedDeclaration())) { if (llvm::Optional Res = @@ -7612,8 +7620,8 @@ class MappableExprsHandler { isa(Next->getAssociatedExpression())) && "Unexpected expression"); - Address LB = - CGF.EmitOMPSharedLValue(I->getAssociatedExpression()).getAddress(); + Address LB = CGF.EmitOMPSharedLValue(I->getAssociatedExpression()) + .getAddress(CGF); // If this component is a pointer inside the base struct then we don't // need to create any entry for it - it will be combined with the object @@ -7660,7 +7668,7 @@ class MappableExprsHandler { if (MC.getAssociatedDeclaration()) { ComponentLB = CGF.EmitOMPSharedLValue(MC.getAssociatedExpression()) - .getAddress(); + .getAddress(CGF); Size = CGF.Builder.CreatePtrDiff( CGF.EmitCastToVoidPtr(ComponentLB.getPointer()), CGF.EmitCastToVoidPtr(LB.getPointer())); @@ -8064,7 +8072,7 @@ class MappableExprsHandler { auto CI = DeferredInfo.find(M.first); if (CI != DeferredInfo.end()) { for (const DeferredDevicePtrEntryTy &L : CI->second) { - llvm::Value *BasePtr = this->CGF.EmitLValue(L.IE).getPointer(); + llvm::Value *BasePtr = this->CGF.EmitLValue(L.IE).getPointer(CGF); llvm::Value *Ptr = this->CGF.EmitLoadOfScalar( this->CGF.EmitLValue(L.IE), L.IE->getExprLoc()); CurBasePointers.emplace_back(BasePtr, L.VD); @@ -8186,9 +8194,10 @@ class MappableExprsHandler { LValue ThisLVal = CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture); LValue ThisLValVal = CGF.EmitLValueForField(VDLVal, ThisCapture); - LambdaPointers.try_emplace(ThisLVal.getPointer(), VDLVal.getPointer()); - BasePointers.push_back(ThisLVal.getPointer()); - Pointers.push_back(ThisLValVal.getPointer()); + LambdaPointers.try_emplace(ThisLVal.getPointer(CGF), + VDLVal.getPointer(CGF)); + BasePointers.push_back(ThisLVal.getPointer(CGF)); + Pointers.push_back(ThisLValVal.getPointer(CGF)); Sizes.push_back( CGF.Builder.CreateIntCast(CGF.getTypeSize(CGF.getContext().VoidPtrTy), CGF.Int64Ty, /*isSigned=*/true)); @@ -8206,17 +8215,19 @@ class MappableExprsHandler { LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second); if (LC.getCaptureKind() == LCK_ByRef) { LValue VarLValVal = CGF.EmitLValueForField(VDLVal, It->second); - LambdaPointers.try_emplace(VarLVal.getPointer(), VDLVal.getPointer()); - BasePointers.push_back(VarLVal.getPointer()); - Pointers.push_back(VarLValVal.getPointer()); + LambdaPointers.try_emplace(VarLVal.getPointer(CGF), + VDLVal.getPointer(CGF)); + BasePointers.push_back(VarLVal.getPointer(CGF)); + Pointers.push_back(VarLValVal.getPointer(CGF)); Sizes.push_back(CGF.Builder.CreateIntCast( CGF.getTypeSize( VD->getType().getCanonicalType().getNonReferenceType()), CGF.Int64Ty, /*isSigned=*/true)); } else { RValue VarRVal = CGF.EmitLoadOfLValue(VarLVal, RD->getLocation()); - LambdaPointers.try_emplace(VarLVal.getPointer(), VDLVal.getPointer()); - BasePointers.push_back(VarLVal.getPointer()); + LambdaPointers.try_emplace(VarLVal.getPointer(CGF), + VDLVal.getPointer(CGF)); + BasePointers.push_back(VarLVal.getPointer(CGF)); Pointers.push_back(VarRVal.getScalarVal()); Sizes.push_back(llvm::ConstantInt::get(CGF.Int64Ty, 0)); } @@ -8522,7 +8533,7 @@ class MappableExprsHandler { 
CGF.CGM.getOpenMPRuntime().registerTargetFirstprivateCopy(CGF, VD); // Copy the value of the original variable to the new global copy. CGF.Builder.CreateMemCpy( - CGF.MakeNaturalAlignAddrLValue(Addr, ElementType).getAddress(), + CGF.MakeNaturalAlignAddrLValue(Addr, ElementType).getAddress(CGF), Address(CV, CGF.getContext().getTypeAlignInChars(ElementType)), CurSizes.back(), /*IsVolatile=*/false); // Use new global variable as the base pointers. @@ -8932,7 +8943,7 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D, Scope.addPrivate(MapperVarDecl, [&MapperCGF, PtrCurrent, PtrTy]() { return MapperCGF .EmitLoadOfPointerLValue(PtrCurrent, PtrTy->castAs()) - .getAddress(); + .getAddress(MapperCGF); }); (void)Scope.Privatize(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index abfba39e6be17..e5ec3deac2c94 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -2318,7 +2318,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, VarTy = Rec.second.FD->getType(); } else { llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( - VarAddr.getAddress().getPointer(), + VarAddr.getAddress(CGF).getPointer(), {Bld.getInt32(0), getNVPTXLaneID(CGF)}); VarTy = Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); @@ -2326,7 +2326,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, AlignmentSource::Decl); } - Rec.second.PrivateAddr = VarAddr.getAddress(); + Rec.second.PrivateAddr = VarAddr.getAddress(CGF); if (!IsInTTDRegion && (WithSPMDCheck || getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) { @@ -2337,10 +2337,10 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, "Secondary glob data must be one per team."); LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); VarAddr.setAddress( - Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(), - VarAddr.getPointer()), + Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), + VarAddr.getPointer(CGF)), VarAddr.getAlignment())); - Rec.second.PrivateAddr = VarAddr.getAddress(); + Rec.second.PrivateAddr = VarAddr.getAddress(CGF); } Address GlobalPtr = Rec.second.PrivateAddr; Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); @@ -2352,7 +2352,8 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, if (EscapedParam) { const auto *VD = cast(Rec.first); CGF.EmitStoreOfScalar(ParValue, VarAddr); - I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress()); + I->getSecond().MappedParams->setVarAddr(CGF, VD, + VarAddr.getAddress(CGF)); } if (IsTTD) ++SecIt; @@ -2386,7 +2387,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF, CGM.getContext().getDeclAlign(VD), AlignmentSource::Decl); I->getSecond().MappedParams->setVarAddr(CGF, cast(VD), - Base.getAddress()); + Base.getAddress(CGF)); I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); } I->getSecond().MappedParams->apply(CGF); @@ -3690,7 +3691,8 @@ static llvm::Value *emitListToGlobalCopyFunction( const FieldDecl *FD = VarFieldMap.lookup(VD); LValue GlobLVal = CGF.EmitLValueForField( CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); 
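The four list-to-global and global-to-list helpers in this stretch all compute the same kind of address: one slot of a reduction variable inside a statically shaped global buffer, via an inbounds GEP off the field lvalue. A rough C++ analogue with purely illustrative names (the record layout here is assumed, not taken from the patch):

    #include <cstddef>

    // Illustrative stand-in for the per-team record in the global buffer.
    struct TeamRecord {
      int partial_sum;
      double partial_max;
    };

    // Roughly what CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs) yields:
    // the address of one slot of a field, selected by a runtime index.
    double *slotPtr(TeamRecord *buffer, std::size_t idx) {
      return &buffer[idx].partial_max;
    }

    int main() {
      TeamRecord buf[4] = {};
      *slotPtr(buf, 2) = 1.5; // write slot 2, as the copy functions do
      return 0;
    }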
GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { @@ -3787,7 +3789,8 @@ static llvm::Value *emitListToGlobalReduceFunction( const FieldDecl *FD = VarFieldMap.lookup(VD); LValue GlobLVal = CGF.EmitLValueForField( CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); if ((*IPriv)->getType()->isVariablyModifiedType()) { @@ -3891,7 +3894,8 @@ static llvm::Value *emitGlobalToListCopyFunction( const FieldDecl *FD = VarFieldMap.lookup(VD); LValue GlobLVal = CGF.EmitLValueForField( CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); GlobLVal.setAddress(Address(BufferPtr, GlobLVal.getAlignment())); switch (CGF.getEvaluationKind(Private->getType())) { case TEK_Scalar: { @@ -3987,7 +3991,8 @@ static llvm::Value *emitGlobalToListReduceFunction( const FieldDecl *FD = VarFieldMap.lookup(VD); LValue GlobLVal = CGF.EmitLValueForField( CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD); - llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobLVal.getPointer(), Idxs); + llvm::Value *BufferPtr = + Bld.CreateInBoundsGEP(GlobLVal.getPointer(CGF), Idxs); llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr); CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy); if ((*IPriv)->getType()->isVariablyModifiedType()) { @@ -4310,7 +4315,7 @@ void CGOpenMPRuntimeNVPTX::emitReduction( Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx); CGF.Builder.CreateStore( CGF.Builder.CreatePointerBitCastOrAddrSpaceCast( - CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy), + CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy), Elem); if ((*IPriv)->getType()->isVariablyModifiedType()) { // Store array size. 
@@ -4892,7 +4897,7 @@ void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas( if (VD->getType().getCanonicalType()->isReferenceType()) VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType().getCanonicalType()) - .getAddress(); + .getAddress(CGF); CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal); } } diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index f3a4e98edc3a5..1005855a5cadb 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -1857,15 +1857,15 @@ CodeGenFunction::EmitAsmInputLValue(const TargetInfo::ConstraintInfo &Info, Ty = llvm::IntegerType::get(getLLVMContext(), Size); Ty = llvm::PointerType::getUnqual(Ty); - Arg = Builder.CreateLoad(Builder.CreateBitCast(InputValue.getAddress(), - Ty)); + Arg = Builder.CreateLoad( + Builder.CreateBitCast(InputValue.getAddress(*this), Ty)); } else { - Arg = InputValue.getPointer(); + Arg = InputValue.getPointer(*this); ConstraintStr += '*'; } } } else { - Arg = InputValue.getPointer(); + Arg = InputValue.getPointer(*this); ConstraintStr += '*'; } @@ -2114,8 +2114,8 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { LargestVectorWidth = std::max((uint64_t)LargestVectorWidth, VT->getPrimitiveSizeInBits().getFixedSize()); } else { - ArgTypes.push_back(Dest.getAddress().getType()); - Args.push_back(Dest.getPointer()); + ArgTypes.push_back(Dest.getAddress(*this).getType()); + Args.push_back(Dest.getPointer(*this)); Constraints += "=*"; Constraints += OutputConstraint; ReadOnly = ReadNone = false; @@ -2357,7 +2357,7 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) { // ResultTypeRequiresCast.size() elements of RegResults. if ((i < ResultTypeRequiresCast.size()) && ResultTypeRequiresCast[i]) { unsigned Size = getContext().getTypeSize(ResultRegQualTys[i]); - Address A = Builder.CreateBitCast(Dest.getAddress(), + Address A = Builder.CreateBitCast(Dest.getAddress(*this), ResultRegTypes[i]->getPointerTo()); QualType Ty = getContext().getIntTypeForBitwidth(Size, /*Signed*/ false); if (Ty.isNull()) { @@ -2410,14 +2410,14 @@ CodeGenFunction::EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K) { delete CGF.CapturedStmtInfo; // Emit call to the helper function. - EmitCallOrInvoke(F, CapStruct.getPointer()); + EmitCallOrInvoke(F, CapStruct.getPointer(*this)); return F; } Address CodeGenFunction::GenerateCapturedStmtArgument(const CapturedStmt &S) { LValue CapStruct = InitCapturedStruct(S); - return CapStruct.getAddress(); + return CapStruct.getAddress(*this); } /// Creates the outlined function for a CapturedStmt. 
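Context for the EmitAsmStmt hunks above: when an output operand cannot live in a register, codegen passes the lvalue's address (Dest.getPointer(*this)) and prefixes the constraint with "=*", marking it an indirect output. At the source level that corresponds to a memory-constrained operand; a small example using GCC/Clang extended asm, x86/x86-64 only:

    #include <cstdio>

    int report;

    void probe() {
      // "=m" forces a memory operand, so the compiler takes the address of
      // `report` (the Dest.getAddress(*this) path) instead of a register.
      asm("movl $7, %0" : "=m"(report));
    }

    int main() {
      probe();
      std::printf("%d\n", report); // prints 7
      return 0;
    }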
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index e2c055f549e02..1e6933df7084d 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -15,6 +15,7 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "TargetInfo.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtOpenMP.h" #include "clang/AST/DeclOpenMP.h" @@ -77,7 +78,7 @@ class OMPLexicalScope : public CodeGenFunction::LexicalScope { InlinedShareds.isGlobalVarCaptured(VD)), VD->getType().getNonReferenceType(), VK_LValue, C.getLocation()); InlinedShareds.addPrivate(VD, [&CGF, &DRE]() -> Address { - return CGF.EmitLValue(&DRE).getAddress(); + return CGF.EmitLValue(&DRE).getAddress(CGF); }); } } @@ -232,7 +233,7 @@ class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope { VD->getType().getNonReferenceType(), VK_LValue, C.getLocation()); InlinedShareds.addPrivate(VD, [&CGF, &DRE]() -> Address { - return CGF.EmitLValue(&DRE).getAddress(); + return CGF.EmitLValue(&DRE).getAddress(CGF); }); } } @@ -325,7 +326,7 @@ void CodeGenFunction::GenerateOpenMPCapturedVars( CapturedVars.push_back(CV); } else { assert(CurCap->capturesVariable() && "Expected capture by reference."); - CapturedVars.push_back(EmitLValue(*I).getAddress().getPointer()); + CapturedVars.push_back(EmitLValue(*I).getAddress(*this).getPointer()); } } } @@ -336,11 +337,11 @@ static Address castValueFromUintptr(CodeGenFunction &CGF, SourceLocation Loc, ASTContext &Ctx = CGF.getContext(); llvm::Value *CastedPtr = CGF.EmitScalarConversion( - AddrLV.getAddress().getPointer(), Ctx.getUIntPtrType(), + AddrLV.getAddress(CGF).getPointer(), Ctx.getUIntPtrType(), Ctx.getPointerType(DstType), Loc); Address TmpAddr = CGF.MakeNaturalAlignAddrLValue(CastedPtr, Ctx.getPointerType(DstType)) - .getAddress(); + .getAddress(CGF); return TmpAddr; } @@ -519,7 +520,7 @@ static llvm::Function *emitOutlinedFunctionPrologue( } else if (I->capturesVariable()) { const VarDecl *Var = I->getCapturedVar(); QualType VarTy = Var->getType(); - Address ArgAddr = ArgLVal.getAddress(); + Address ArgAddr = ArgLVal.getAddress(CGF); if (ArgLVal.getType()->isLValueReferenceType()) { ArgAddr = CGF.EmitLoadOfReference(ArgLVal); } else if (!VarTy->isVariablyModifiedType() || !VarTy->isPointerType()) { @@ -541,12 +542,12 @@ static llvm::Function *emitOutlinedFunctionPrologue( ? castValueFromUintptr( CGF, I->getLocation(), FD->getType(), Args[Cnt]->getName(), ArgLVal) - : ArgLVal.getAddress()}}); + : ArgLVal.getAddress(CGF)}}); } else { // If 'this' is captured, load it into CXXThisValue. 
assert(I->capturesThis()); CXXThisValue = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation()); - LocalAddrs.insert({Args[Cnt], {nullptr, ArgLVal.getAddress()}}); + LocalAddrs.insert({Args[Cnt], {nullptr, ArgLVal.getAddress(CGF)}}); } ++Cnt; ++I; @@ -830,8 +831,8 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D, EmitAggregateAssign(Dest, OriginalLVal, Type); } else { EmitOMPAggregateAssign( - Emission.getAllocatedAddress(), OriginalLVal.getAddress(), - Type, + Emission.getAllocatedAddress(), + OriginalLVal.getAddress(*this), Type, [this, VDInit, Init](Address DestElement, Address SrcElement) { // Clean up any temporaries needed by the @@ -849,7 +850,7 @@ bool CodeGenFunction::EmitOMPFirstprivateClause(const OMPExecutableDirective &D, return Emission.getAllocatedAddress(); }); } else { - Address OriginalAddr = OriginalLVal.getAddress(); + Address OriginalAddr = OriginalLVal.getAddress(*this); IsRegistered = PrivateScope.addPrivate( OrigVD, [this, VDInit, OriginalAddr, VD]() { // Emit private VarDecl with copy init. @@ -926,7 +927,7 @@ bool CodeGenFunction::EmitOMPCopyinClause(const OMPExecutableDirective &D) { "Copyin threadprivates should have been captured!"); DeclRefExpr DRE(getContext(), const_cast(VD), true, (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc()); - MasterAddr = EmitLValue(&DRE).getAddress(); + MasterAddr = EmitLValue(&DRE).getAddress(*this); LocalDeclMap.erase(VD); } else { MasterAddr = @@ -935,7 +936,7 @@ bool CodeGenFunction::EmitOMPCopyinClause(const OMPExecutableDirective &D) { getContext().getDeclAlign(VD)); } // Get the address of the threadprivate variable. - Address PrivateAddr = EmitLValue(*IRef).getAddress(); + Address PrivateAddr = EmitLValue(*IRef).getAddress(*this); if (CopiedVars.size() == 1) { // At first check if current thread is a master thread. If it is, no // need to copy data. @@ -1003,7 +1004,7 @@ bool CodeGenFunction::EmitOMPLastprivateClauseInit( /*RefersToEnclosingVariableOrCapture=*/ CapturedStmtInfo->lookup(OrigVD) != nullptr, (*IRef)->getType(), VK_LValue, (*IRef)->getExprLoc()); - return EmitLValue(&DRE).getAddress(); + return EmitLValue(&DRE).getAddress(*this); }); // Check if the variable is also a firstprivate: in this case IInit is // not generated. Initialization of this variable will happen in codegen @@ -1160,8 +1161,8 @@ void CodeGenFunction::EmitOMPReductionClauseInit( if (isaOMPArraySectionExpr && Type->isVariablyModifiedType()) { // Store the address of the original variable associated with the LHS // implicit variable. - PrivateScope.addPrivate(LHSVD, [&RedCG, Count]() { - return RedCG.getSharedLValue(Count).getAddress(); + PrivateScope.addPrivate(LHSVD, [&RedCG, Count, this]() { + return RedCG.getSharedLValue(Count).getAddress(*this); }); PrivateScope.addPrivate( RHSVD, [this, PrivateVD]() { return GetAddrOfLocalVar(PrivateVD); }); @@ -1169,8 +1170,8 @@ void CodeGenFunction::EmitOMPReductionClauseInit( isa(IRef)) { // Store the address of the original variable associated with the LHS // implicit variable. 
- PrivateScope.addPrivate(LHSVD, [&RedCG, Count]() { - return RedCG.getSharedLValue(Count).getAddress(); + PrivateScope.addPrivate(LHSVD, [&RedCG, Count, this]() { + return RedCG.getSharedLValue(Count).getAddress(*this); }); PrivateScope.addPrivate(RHSVD, [this, PrivateVD, RHSVD]() { return Builder.CreateElementBitCast(GetAddrOfLocalVar(PrivateVD), @@ -1180,7 +1181,7 @@ void CodeGenFunction::EmitOMPReductionClauseInit( } else { QualType Type = PrivateVD->getType(); bool IsArray = getContext().getAsArrayType(Type) != nullptr; - Address OriginalAddr = RedCG.getSharedLValue(Count).getAddress(); + Address OriginalAddr = RedCG.getSharedLValue(Count).getAddress(*this); // Store the address of the original variable associated with the LHS // implicit variable. if (IsArray) { @@ -1529,7 +1530,7 @@ void CodeGenFunction::EmitOMPLinearClauseFinal( DeclRefExpr DRE(getContext(), const_cast<VarDecl *>(OrigVD), CapturedStmtInfo->lookup(OrigVD) != nullptr, (*IC)->getType(), VK_LValue, (*IC)->getExprLoc()); - Address OrigAddr = EmitLValue(&DRE).getAddress(); + Address OrigAddr = EmitLValue(&DRE).getAddress(*this); CodeGenFunction::OMPPrivateScope VarScope(*this); VarScope.addPrivate(OrigVD, [OrigAddr]() { return OrigAddr; }); (void)VarScope.Privatize(); @@ -1599,7 +1600,7 @@ void CodeGenFunction::EmitOMPPrivateLoopCounters( DeclRefExpr DRE(getContext(), const_cast<VarDecl *>(VD), LocalDeclMap.count(VD) || CapturedStmtInfo->lookup(VD), E->getType(), VK_LValue, E->getExprLoc()); - return EmitLValue(&DRE).getAddress(); + return EmitLValue(&DRE).getAddress(*this); }); } else { (void)LoopScope.addPrivate(PrivateVD, [&VarEmission]() { @@ -1762,12 +1763,13 @@ void CodeGenFunction::EmitOMPSimdFinal( } Address OrigAddr = Address::invalid(); if (CED) { - OrigAddr = EmitLValue(CED->getInit()->IgnoreImpCasts()).getAddress(); + OrigAddr = + EmitLValue(CED->getInit()->IgnoreImpCasts()).getAddress(*this); } else { DeclRefExpr DRE(getContext(), const_cast<VarDecl *>(PrivateVD), /*RefersToEnclosingVariableOrCapture=*/false, (*IPC)->getType(), VK_LValue, (*IPC)->getExprLoc()); - OrigAddr = EmitLValue(&DRE).getAddress(); + OrigAddr = EmitLValue(&DRE).getAddress(*this); } OMPPrivateScope VarScope(*this); VarScope.addPrivate(OrigVD, [OrigAddr]() { return OrigAddr; }); @@ -2277,14 +2279,16 @@ static void emitDistributeParallelForDistributeInnerBoundParams( const auto &Dir = cast<OMPLoopDirective>(S); LValue LB = CGF.EmitLValue(cast<DeclRefExpr>(Dir.getCombinedLowerBoundVariable())); - llvm::Value *LBCast = CGF.Builder.CreateIntCast( - CGF.Builder.CreateLoad(LB.getAddress()), CGF.SizeTy, /*isSigned=*/false); + llvm::Value *LBCast = + CGF.Builder.CreateIntCast(CGF.Builder.CreateLoad(LB.getAddress(CGF)), + CGF.SizeTy, /*isSigned=*/false); CapturedVars.push_back(LBCast); LValue UB = CGF.EmitLValue(cast<DeclRefExpr>(Dir.getCombinedUpperBoundVariable())); - llvm::Value *UBCast = CGF.Builder.CreateIntCast( - CGF.Builder.CreateLoad(UB.getAddress()), CGF.SizeTy, /*isSigned=*/false); + llvm::Value *UBCast = + CGF.Builder.CreateIntCast(CGF.Builder.CreateLoad(UB.getAddress(CGF)), + CGF.SizeTy, /*isSigned=*/false); CapturedVars.push_back(UBCast); } @@ -2521,8 +2525,8 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( // one chunk is distributed to each thread. Note that the size of // the chunks is unspecified in this case. CGOpenMPRuntime::StaticRTInput StaticInit( - IVSize, IVSigned, Ordered, IL.getAddress(), LB.getAddress(), - UB.getAddress(), ST.getAddress(), + IVSize, IVSigned, Ordered, IL.getAddress(CGF), + LB.getAddress(CGF), UB.getAddress(CGF), ST.getAddress(CGF), StaticChunkedOne ?
Chunk : nullptr); CGF.CGM.getOpenMPRuntime().emitForStaticInit( CGF, S.getBeginLoc(), S.getDirectiveKind(), ScheduleKind, @@ -2571,9 +2575,9 @@ bool CodeGenFunction::EmitOMPWorksharingLoop( ScheduleKind.M2 == OMPC_SCHEDULE_MODIFIER_monotonic; // Emit the outer loop, which requests its work chunk [LB..UB] from // runtime and runs the inner loop to process it. - const OMPLoopArguments LoopArguments(LB.getAddress(), UB.getAddress(), - ST.getAddress(), IL.getAddress(), - Chunk, EUB); + const OMPLoopArguments LoopArguments( + LB.getAddress(*this), UB.getAddress(*this), ST.getAddress(*this), + IL.getAddress(*this), Chunk, EUB); EmitOMPForOuterLoop(ScheduleKind, IsMonotonic, S, LoopScope, Ordered, LoopArguments, CGDispatchBounds); } @@ -2777,8 +2781,8 @@ void CodeGenFunction::EmitSections(const OMPExecutableDirective &S) { OpenMPScheduleTy ScheduleKind; ScheduleKind.Schedule = OMPC_SCHEDULE_static; CGOpenMPRuntime::StaticRTInput StaticInit( - /*IVSize=*/32, /*IVSigned=*/true, /*Ordered=*/false, IL.getAddress(), - LB.getAddress(), UB.getAddress(), ST.getAddress()); + /*IVSize=*/32, /*IVSigned=*/true, /*Ordered=*/false, IL.getAddress(CGF), + LB.getAddress(CGF), UB.getAddress(CGF), ST.getAddress(CGF)); CGF.CGM.getOpenMPRuntime().emitForStaticInit( CGF, S.getBeginLoc(), S.getDirectiveKind(), ScheduleKind, StaticInit); // UB = min(UB, GlobalUB); @@ -3112,7 +3116,7 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( Pair.second->getType(), VK_LValue, Pair.second->getExprLoc()); Scope.addPrivate(Pair.first, [&CGF, &DRE]() { - return CGF.EmitLValue(&DRE).getAddress(); + return CGF.EmitLValue(&DRE).getAddress(CGF); }); } for (const auto &Pair : PrivatePtrs) { @@ -3209,7 +3213,8 @@ void CodeGenFunction::EmitOMPTaskBasedDirective( S, *I, *PartId, *TaskT, S.getDirectiveKind(), CodeGen, Data.Tied, Data.NumberOfParts); OMPLexicalScope Scope(*this, S, llvm::None, - !isOpenMPParallelDirective(S.getDirectiveKind())); + !isOpenMPParallelDirective(S.getDirectiveKind()) && + !isOpenMPSimdDirective(S.getDirectiveKind())); TaskGen(*this, OutlinedFn, Data); } @@ -3570,8 +3575,8 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, if (isOpenMPSimdDirective(S.getDirectiveKind())) EmitOMPSimdInit(S, /*IsMonotonic=*/true); CGOpenMPRuntime::StaticRTInput StaticInit( - IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(), - LB.getAddress(), UB.getAddress(), ST.getAddress(), + IVSize, IVSigned, /* Ordered = */ false, IL.getAddress(*this), + LB.getAddress(*this), UB.getAddress(*this), ST.getAddress(*this), StaticChunked ? Chunk : nullptr); RT.emitDistributeStaticInit(*this, S.getBeginLoc(), ScheduleKind, StaticInit); @@ -3637,8 +3642,8 @@ void CodeGenFunction::EmitOMPDistributeLoop(const OMPLoopDirective &S, // Emit the outer loop, which requests its work chunk [LB..UB] from // runtime and runs the inner loop to process it. const OMPLoopArguments LoopArguments = { - LB.getAddress(), UB.getAddress(), ST.getAddress(), IL.getAddress(), - Chunk}; + LB.getAddress(*this), UB.getAddress(*this), ST.getAddress(*this), + IL.getAddress(*this), Chunk}; EmitOMPDistributeOuterLoop(ScheduleKind, S, LoopScope, LoopArguments, CodeGenLoop); } @@ -3838,11 +3843,11 @@ static std::pair<bool, RValue> emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X, // expression is simple and atomic is allowed for the given type for the // target platform.
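// StaticRTInput and OMPLoopArguments, used heavily above, are parameter
// objects: the loop-control addresses (lower bound, upper bound, stride and
// the "is last iteration" flag) travel together rather than as ever-growing
// argument lists. The idea in miniature (types and names are illustrative
// only, not the clang interfaces):
#include <cstdio>

struct Address { int *Ptr; };

struct StaticLoopInput {
  int IVSize;         // width of the induction variable in bits
  bool IVSigned;      // signedness of the induction variable
  bool Ordered;       // whether the construct has an ordered clause
  Address IsLast;     // set for the thread that runs the last chunk
  Address LB, UB, ST; // lower bound, upper bound, stride
};

// Call sites stay readable even as the parameter set grows.
static void emitForStaticInit(const StaticLoopInput &Input) {
  std::printf("ivsize=%d signed=%d ordered=%d\n", Input.IVSize,
              (int)Input.IVSigned, (int)Input.Ordered);
}

int main() {
  int IL = 0, LB = 0, UB = 99, ST = 1;
  emitForStaticInit({32, true, false, {&IL}, {&LB}, {&UB}, {&ST}});
}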
if (BO == BO_Comma || !Update.isScalar() || - !Update.getScalarVal()->getType()->isIntegerTy() || - !X.isSimple() || (!isa<llvm::ConstantInt>(Update.getScalarVal()) && - (Update.getScalarVal()->getType() != - X.getAddress().getElementType())) || - !X.getAddress().getElementType()->isIntegerTy() || + !Update.getScalarVal()->getType()->isIntegerTy() || !X.isSimple() || + (!isa<llvm::ConstantInt>(Update.getScalarVal()) && + (Update.getScalarVal()->getType() != + X.getAddress(CGF).getElementType())) || + !X.getAddress(CGF).getElementType()->isIntegerTy() || !Context.getTargetInfo().hasBuiltinAtomic( Context.getTypeSize(X.getType()), Context.toBits(X.getAlignment()))) return std::make_pair(false, RValue::get(nullptr)); @@ -3914,11 +3919,11 @@ static std::pair<bool, RValue> emitOMPAtomicRMW(CodeGenFunction &CGF, LValue X, llvm::Value *UpdateVal = Update.getScalarVal(); if (auto *IC = dyn_cast<llvm::ConstantInt>(UpdateVal)) { UpdateVal = CGF.Builder.CreateIntCast( - IC, X.getAddress().getElementType(), + IC, X.getAddress(CGF).getElementType(), X.getType()->hasSignedIntegerRepresentation()); } llvm::Value *Res = - CGF.Builder.CreateAtomicRMW(RMWOp, X.getPointer(), UpdateVal, AO); + CGF.Builder.CreateAtomicRMW(RMWOp, X.getPointer(CGF), UpdateVal, AO); return std::make_pair(true, RValue::get(Res)); } @@ -5101,10 +5106,7 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) { CGF.incrementProfileCounter(&S); } - if (isOpenMPSimdDirective(S.getDirectiveKind())) { - CGF.EmitOMPSimdInit(S); - (void)CGF.EmitOMPLinearClauseInit(S); - } + (void)CGF.EmitOMPLinearClauseInit(S); OMPPrivateScope LoopScope(CGF); // Emit helper vars inits. @@ -5140,13 +5142,24 @@ void CodeGenFunction::EmitOMPTaskLoopBasedDirective(const OMPLoopDirective &S) { CGF.EmitIgnoredExpr(S.getCalcLastIteration()); } - CGF.EmitOMPInnerLoop(S, LoopScope.requiresCleanups(), S.getCond(), - S.getInc(), - [&S](CodeGenFunction &CGF) { - CGF.EmitOMPLoopBody(S, JumpDest()); - CGF.EmitStopPoint(&S); - }, - [](CodeGenFunction &) {}); + { + OMPLexicalScope Scope(CGF, S, OMPD_taskloop, /*EmitPreInitStmt=*/false); + emitCommonSimdLoop( + CGF, S, + [&S](CodeGenFunction &CGF, PrePostActionTy &) { + if (isOpenMPSimdDirective(S.getDirectiveKind())) + CGF.EmitOMPSimdInit(S); + }, + [&S, &LoopScope](CodeGenFunction &CGF, PrePostActionTy &) { + CGF.EmitOMPInnerLoop( + S, LoopScope.requiresCleanups(), S.getCond(), S.getInc(), + [&S](CodeGenFunction &CGF) { + CGF.EmitOMPLoopBody(S, CodeGenFunction::JumpDest()); + CGF.EmitStopPoint(&S); + }, + [](CodeGenFunction &) {}); + }); + } // Emit: if (PreCond) - end.
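// The taskloop rewrite above routes inner-loop emission through
// emitCommonSimdLoop, which takes two callbacks: one for simd-specific setup
// and one that emits the loop itself. A stripped-down model of that control
// flow (hypothetical names, not the real clang signatures):
#include <functional>
#include <iostream>

struct CodeGen {
  bool IsSimd; // whether the directive has simd semantics
};

static void emitCommonSimdLoop(CodeGen &CG,
                               const std::function<void(CodeGen &)> &SimdInit,
                               const std::function<void(CodeGen &)> &BodyGen) {
  SimdInit(CG); // e.g. attach vectorization hints to the loop
  BodyGen(CG);  // emit the loop body
}

int main() {
  CodeGen CG{true};
  emitCommonSimdLoop(
      CG,
      [](CodeGen &C) {
        if (C.IsSimd)
          std::cout << "simd init\n"; // runs only for the simd variants
      },
      [](CodeGen &C) { (void)C; std::cout << "inner loop\n"; });
}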
if (ContBlock) { CGF.EmitBranch(ContBlock); @@ -5200,6 +5213,7 @@ void CodeGenFunction::EmitOMPTaskLoopDirective(const OMPTaskLoopDirective &S) { void CodeGenFunction::EmitOMPTaskLoopSimdDirective( const OMPTaskLoopSimdDirective &S) { + OMPLexicalScope Scope(*this, S); EmitOMPTaskLoopBasedDirective(S); } @@ -5219,7 +5233,7 @@ void CodeGenFunction::EmitOMPMasterTaskLoopSimdDirective( Action.Enter(CGF); EmitOMPTaskLoopBasedDirective(S); }; - OMPLexicalScope Scope(*this, S, llvm::None, /*EmitPreInitStmt=*/false); + OMPLexicalScope Scope(*this, S); CGM.getOpenMPRuntime().emitMasterRegion(*this, CodeGen, S.getBeginLoc()); } @@ -5292,7 +5306,7 @@ void CodeGenFunction::EmitSimpleOMPExecutableDirective( if (!VD->hasLocalStorage() && !CGF.LocalDeclMap.count(VD)) { LValue GlobLVal = CGF.EmitLValue(E); LoopGlobals.addPrivate( - VD, [&GlobLVal]() { return GlobLVal.getAddress(); }); + VD, [&GlobLVal, &CGF]() { return GlobLVal.getAddress(CGF); }); } if (isa<OMPCapturedExprDecl>(VD)) { // Emit only those that were not explicitly referenced in clauses. diff --git a/clang/lib/CodeGen/CGValue.h b/clang/lib/CodeGen/CGValue.h index 71f95abe488a9..9fd07bdb187d4 100644 --- a/clang/lib/CodeGen/CGValue.h +++ b/clang/lib/CodeGen/CGValue.h @@ -29,6 +29,7 @@ namespace llvm { namespace clang { namespace CodeGen { class AggValueSlot; + class CodeGenFunction; struct CGBitFieldInfo; /// RValue - This trivial value class is used to represent the result of an @@ -319,11 +320,13 @@ class LValue { void setBaseInfo(LValueBaseInfo Info) { BaseInfo = Info; } // simple lvalue - llvm::Value *getPointer() const { + llvm::Value *getPointer(CodeGenFunction &CGF) const { assert(isSimple()); return V; } - Address getAddress() const { return Address(getPointer(), getAlignment()); } + Address getAddress(CodeGenFunction &CGF) const { + return Address(getPointer(CGF), getAlignment()); + } void setAddress(Address address) { assert(isSimple()); V = address.getPointer(); @@ -427,8 +430,8 @@ class LValue { return R; } - RValue asAggregateRValue() const { - return RValue::getAggregate(getAddress(), isVolatileQualified()); + RValue asAggregateRValue(CodeGenFunction &CGF) const { + return RValue::getAggregate(getAddress(CGF), isVolatileQualified()); } }; @@ -536,14 +539,12 @@ class AggValueSlot { return AV; } - static AggValueSlot forLValue(const LValue &LV, - IsDestructed_t isDestructed, - NeedsGCBarriers_t needsGC, - IsAliased_t isAliased, - Overlap_t mayOverlap, - IsZeroed_t isZeroed = IsNotZeroed, - IsSanitizerChecked_t isChecked = IsNotSanitizerChecked) { - return forAddr(LV.getAddress(), LV.getQuals(), isDestructed, needsGC, + static AggValueSlot + forLValue(const LValue &LV, CodeGenFunction &CGF, IsDestructed_t isDestructed, + NeedsGCBarriers_t needsGC, IsAliased_t isAliased, + Overlap_t mayOverlap, IsZeroed_t isZeroed = IsNotZeroed, + IsSanitizerChecked_t isChecked = IsNotSanitizerChecked) { + return forAddr(LV.getAddress(CGF), LV.getQuals(), isDestructed, needsGC, isAliased, mayOverlap, isZeroed, isChecked); } diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index e2707fe2bec8d..475d015ecf268 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -34,6 +34,8 @@ #include "clang/Frontend/FrontendDiagnostic.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/FPEnv.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Operator.h" @@ -88,6 +90,7 @@
CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext) FMF.setAllowReassoc(); } Builder.setFastMathFlags(FMF); + SetFPModel(); } CodeGenFunction::~CodeGenFunction() { @@ -103,6 +106,51 @@ CodeGenFunction::~CodeGenFunction() { CGM.getOpenMPRuntime().functionFinished(*this); } +// Map the LangOption for rounding mode into +// the corresponding enum in the IR. +static llvm::fp::RoundingMode ToConstrainedRoundingMD( + LangOptions::FPRoundingModeKind Kind) { + + switch (Kind) { + case LangOptions::FPR_ToNearest: return llvm::fp::rmToNearest; + case LangOptions::FPR_Downward: return llvm::fp::rmDownward; + case LangOptions::FPR_Upward: return llvm::fp::rmUpward; + case LangOptions::FPR_TowardZero: return llvm::fp::rmTowardZero; + case LangOptions::FPR_Dynamic: return llvm::fp::rmDynamic; + } + llvm_unreachable("Unsupported FP RoundingMode"); +} + +// Map the LangOption for exception behavior into +// the corresponding enum in the IR. +static llvm::fp::ExceptionBehavior ToConstrainedExceptMD( + LangOptions::FPExceptionModeKind Kind) { + + switch (Kind) { + case LangOptions::FPE_Ignore: return llvm::fp::ebIgnore; + case LangOptions::FPE_MayTrap: return llvm::fp::ebMayTrap; + case LangOptions::FPE_Strict: return llvm::fp::ebStrict; + } + llvm_unreachable("Unsupported FP Exception Behavior"); +} + +void CodeGenFunction::SetFPModel() { + auto fpRoundingMode = ToConstrainedRoundingMD( + getLangOpts().getFPRoundingMode()); + auto fpExceptionBehavior = ToConstrainedExceptMD( + getLangOpts().getFPExceptionMode()); + + if (fpExceptionBehavior == llvm::fp::ebIgnore && + fpRoundingMode == llvm::fp::rmToNearest) + // Constrained intrinsics are not used. + ; + else { + Builder.setIsFPConstrained(true); + Builder.setDefaultConstrainedRounding(fpRoundingMode); + Builder.setDefaultConstrainedExcept(fpExceptionBehavior); + } +} + CharUnits CodeGenFunction::getNaturalPointeeTypeAlignment(QualType T, LValueBaseInfo *BaseInfo, TBAAAccessInfo *TBAAInfo) { @@ -841,6 +889,10 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, if (FD->isMain()) Fn->addFnAttr(llvm::Attribute::NoRecurse); + if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) + if (FD->usesFPIntrin()) + Fn->addFnAttr(llvm::Attribute::StrictFP); + // If a custom alignment is used, force realigning to this alignment on // any main function which certainly will need it. if (const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D)) @@ -999,7 +1051,7 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, LValue ThisFieldLValue = EmitLValueForLambdaField(LambdaThisCaptureField); if (!LambdaThisCaptureField->getType()->isPointerType()) { // If the enclosing object was captured by value, just use its address. - CXXThisValue = ThisFieldLValue.getAddress().getPointer(); + CXXThisValue = ThisFieldLValue.getAddress(*this).getPointer(); } else { // Load the lvalue pointed to by the field, since '*this' was captured // by reference.
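// SetFPModel above keeps the IRBuilder in its normal mode only for the
// default combination (round-to-nearest, FP exceptions ignored) and switches
// it to constrained floating point otherwise. The decision reduces to a small
// predicate; the enums below are simplified stand-ins for the llvm::fp ones:
#include <iostream>

enum class RoundingMode { ToNearest, Downward, Upward, TowardZero, Dynamic };
enum class ExceptionBehavior { Ignore, MayTrap, Strict };

// Constrained intrinsics are needed unless both settings are at their default.
static bool needsConstrainedFP(RoundingMode RM, ExceptionBehavior EB) {
  return !(RM == RoundingMode::ToNearest && EB == ExceptionBehavior::Ignore);
}

int main() {
  std::cout << needsConstrainedFP(RoundingMode::ToNearest,
                                  ExceptionBehavior::Ignore)  // 0
            << needsConstrainedFP(RoundingMode::Dynamic,
                                  ExceptionBehavior::Strict)  // 1
            << '\n';
}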
@@ -2036,11 +2088,11 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { Address CodeGenFunction::EmitVAListRef(const Expr* E) { if (getContext().getBuiltinVaListType()->isArrayType()) return EmitPointerWithAlignment(E); - return EmitLValue(E).getAddress(); + return EmitLValue(E).getAddress(*this); } Address CodeGenFunction::EmitMSVAListRef(const Expr *E) { - return EmitLValue(E).getAddress(); + return EmitLValue(E).getAddress(*this); } void CodeGenFunction::EmitDeclRefExprDbgValue(const DeclRefExpr *E, diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 0c6fcb89f6ba0..c1718cade2f88 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4178,6 +4178,9 @@ class CodeGenFunction : public CodeGenTypeCache { /// point operation, expressed as the maximum relative error in ulp. void SetFPAccuracy(llvm::Value *Val, float Accuracy); + /// SetFPModel - Control floating point behavior via fp-model settings. + void SetFPModel(); + private: llvm::MDNode *getRangeForLoadFromType(QualType Ty); void EmitReturnOfRValue(RValue RV, QualType Ty); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 7432233f8afc9..306969aea522f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -74,6 +74,7 @@ static const char AnnotationSection[] = "llvm.metadata"; static CGCXXABI *createCXXABI(CodeGenModule &CGM) { switch (CGM.getTarget().getCXXABI().getKind()) { + case TargetCXXABI::Fuchsia: case TargetCXXABI::GenericAArch64: case TargetCXXABI::GenericARM: case TargetCXXABI::iOS: diff --git a/clang/lib/CodeGen/ConstantEmitter.h b/clang/lib/CodeGen/ConstantEmitter.h index 59a19730f4ebc..121acbac4fa91 100644 --- a/clang/lib/CodeGen/ConstantEmitter.h +++ b/clang/lib/CodeGen/ConstantEmitter.h @@ -23,7 +23,7 @@ namespace CodeGen { class ConstantEmitter { public: CodeGenModule &CGM; - CodeGenFunction *CGF; + CodeGenFunction *const CGF; private: bool Abstract = false; diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 0a7a4fe33ac2d..bdecff39c88fd 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -1114,8 +1114,8 @@ struct CounterCoverageMappingBuilder // Make a region for the body of the switch. If the body starts with // a case, that case will reuse this region; otherwise, this covers // the unreachable code at the beginning of the switch body. 
- size_t Index = - pushRegion(Counter::getZero(), getStart(CS->body_front())); + size_t Index = pushRegion(Counter::getZero(), getStart(CS)); + getRegion().setGap(true); for (const auto *Child : CS->children()) Visit(Child); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 8f9b16470b642..515eb3f1f168d 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -487,6 +487,19 @@ class iOS64CXXABI : public ARMCXXABI { bool shouldRTTIBeUnique() const override { return false; } }; +class FuchsiaCXXABI final : public ItaniumCXXABI { +public: + explicit FuchsiaCXXABI(CodeGen::CodeGenModule &CGM) + : ItaniumCXXABI(CGM) {} + +private: + bool HasThisReturn(GlobalDecl GD) const override { + return isa<CXXConstructorDecl>(GD.getDecl()) || + (isa<CXXDestructorDecl>(GD.getDecl()) && + GD.getDtorType() != Dtor_Deleting); + } +}; + class WebAssemblyCXXABI final : public ItaniumCXXABI { public: explicit WebAssemblyCXXABI(CodeGen::CodeGenModule &CGM) @@ -516,6 +529,9 @@ CodeGen::CGCXXABI *CodeGen::CreateItaniumCXXABI(CodeGenModule &CGM) { case TargetCXXABI::iOS64: return new iOS64CXXABI(CGM); + case TargetCXXABI::Fuchsia: + return new FuchsiaCXXABI(CGM); + // Note that AArch64 uses the generic ItaniumCXXABI class since it doesn't // include the other 32-bit ARM oddities: constructor/destructor return values // and array cookies. diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 8196df614cee8..800d02d5d0394 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -1343,6 +1343,13 @@ void MicrosoftCXXABI::EmitCXXDestructors(const CXXDestructorDecl *D) { // The TU defining a dtor is only guaranteed to emit a base destructor. All // other destructor variants are delegating thunks. CGM.EmitGlobal(GlobalDecl(D, Dtor_Base)); + + // If the class is dllexported, emit the complete (vbase) destructor wherever + // the base dtor is emitted. + // FIXME: To match MSVC, this should only be done when the class is exported + // with -fdllexport-inlines enabled. + if (D->getParent()->getNumVBases() > 0 && D->hasAttr<DLLExportAttr>()) + CGM.EmitGlobal(GlobalDecl(D, Dtor_Complete)); } CharUnits diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp index ec848a312ae01..97bea0150e7f7 100644 --- a/clang/lib/CodeGen/TargetInfo.cpp +++ b/clang/lib/CodeGen/TargetInfo.cpp @@ -1225,7 +1225,7 @@ void X86_32TargetCodeGenInfo::addReturnRegisterOutputs( ResultTruncRegTypes.push_back(CoerceTy); // Coerce the integer by bitcasting the return slot pointer. - ReturnSlot.setAddress(CGF.Builder.CreateBitCast(ReturnSlot.getAddress(), + ReturnSlot.setAddress(CGF.Builder.CreateBitCast(ReturnSlot.getAddress(CGF), CoerceTy->getPointerTo())); ResultRegDests.push_back(ReturnSlot); diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index 92e04108a7e29..06707fefc9d08 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -13,11 +13,28 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorOr.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/ADT/Triple.h" using namespace clang::driver; using namespace clang; -static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { +static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS, + const llvm::Triple &TargetOrHost) { + // If we don't target Linux, no need to check the distro. This saves a few + // OS calls.
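// DetectDistro, reworked above and below, gains two early exits before any
// /etc/*-release file is read: the target must be Linux for the answer to
// matter, and a non-Linux host probing the real file system cannot have a
// distro at all (the cross-compilation case). The gating logic, modeled with
// toy types instead of llvm::Triple and llvm::vfs:
#include <iostream>

struct Triple { bool IsLinux; };

static bool shouldProbeDistro(const Triple &Target, const Triple &Host,
                              bool UsingRealFS) {
  if (!Target.IsLinux)
    return false; // the result could never be used
  if (!Host.IsLinux && UsingRealFS)
    return false; // a real FS on a non-Linux host has no distro files
  return true;    // otherwise go read /etc/lsb-release and friends
}

int main() {
  std::cout << shouldProbeDistro({true}, {true}, true)   // 1
            << shouldProbeDistro({false}, {true}, true)  // 0
            << shouldProbeDistro({true}, {false}, false) // 1: unit-test VFS
            << '\n';
}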
+ if (!TargetOrHost.isOSLinux()) + return Distro::UnknownDistro; + + // If the host is not running Linux, and we're backed by a real file system, + // no need to check the distro. This is the case where someone is + // cross-compiling from BSD or Windows to Linux, and it would be meaningless + // to try to figure out the "distro" of the non-Linux host. + IntrusiveRefCntPtr<llvm::vfs::FileSystem> RealFS = + llvm::vfs::getRealFileSystem(); + llvm::Triple HostTriple(llvm::sys::getProcessTriple()); + if (!HostTriple.isOSLinux() && &VFS == RealFS.get()) + return Distro::UnknownDistro; + llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> File = VFS.getBufferForFile("/etc/lsb-release"); if (File) { @@ -149,4 +166,5 @@ static Distro::DistroType DetectDistro(llvm::vfs::FileSystem &VFS) { return Distro::UnknownDistro; } -Distro::Distro(llvm::vfs::FileSystem &VFS) : DistroVal(DetectDistro(VFS)) {} +Distro::Distro(llvm::vfs::FileSystem &VFS, const llvm::Triple &TargetOrHost) + : DistroVal(DetectDistro(VFS, TargetOrHost)) {} diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 50450b7deb567..6fbff61f76565 100644 --- a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -20,6 +20,62 @@ using namespace clang::driver::tools; using namespace llvm::opt; +void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, + const InputInfoList &Inputs, + const ArgList &Args, + const char *LinkingOutput) const { + ArgStringList CmdArgs; + + const bool IsArch32Bit = getToolChain().getTriple().isArch32Bit(); + const bool IsArch64Bit = getToolChain().getTriple().isArch64Bit(); + // Only support 32 and 64 bit. + if (!IsArch32Bit && !IsArch64Bit) + llvm_unreachable("Unsupported bit width value."); + + // Specify the mode in which the as(1) command operates. + if (IsArch32Bit) { + CmdArgs.push_back("-a32"); + } else { + // Must be 64-bit, otherwise asserted already. + CmdArgs.push_back("-a64"); + } + + // Accept an undefined symbol as an extern so that an error message is not + // displayed. Otherwise, undefined symbols are flagged with error messages. + // FIXME: This should be removed when the assembly generation from the + // compiler is able to write externs properly. + CmdArgs.push_back("-u"); + + // Accept any mixture of instructions. + // On Power for AIX and Linux, this behaviour matches that of GCC for both the + // user-provided assembler source case and the compiler-produced assembler + // source case. Yet XL with user-provided assembler source would not add this. + CmdArgs.push_back("-many"); + + Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); + + // Specify assembler output file. + assert((Output.isFilename() || Output.isNothing()) && "Invalid output."); + if (Output.isFilename()) { + CmdArgs.push_back("-o"); + CmdArgs.push_back(Output.getFilename()); + } + + // Specify assembler input file. + // The system assembler on AIX takes exactly one input file. The driver is + // expected to invoke as(1) separately for each assembler source input file.
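// The as(1) job built above and finished just below amounts to a fixed
// argument recipe: a width flag, two compatibility flags, the output file,
// then exactly one input. A rough standalone equivalent, with the driver
// plumbing left out:
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> buildAIXAsArgs(bool Is64Bit,
                                               const std::string &Output,
                                               const std::string &Input) {
  std::vector<std::string> Args;
  Args.push_back(Is64Bit ? "-a64" : "-a32"); // assembler mode
  Args.push_back("-u");    // treat undefined symbols as externs
  Args.push_back("-many"); // accept any mixture of instructions
  Args.push_back("-o");
  Args.push_back(Output);
  Args.push_back(Input);   // the AIX assembler takes a single input file
  return Args;
}

int main() {
  for (const std::string &A : buildAIXAsArgs(true, "foo.o", "foo.s"))
    std::cout << A << ' ';
  std::cout << '\n'; // -a64 -u -many -o foo.o foo.s
}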
+ if (Inputs.size() != 1) + llvm_unreachable("Invalid number of input files."); + const InputInfo &II = Inputs[0]; + assert((II.isFilename() || II.isNothing()) && "Invalid input."); + if (II.isFilename()) + CmdArgs.push_back(II.getFilename()); + + const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); + C.addCommand(std::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs)); +} + void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, const InputInfo &Output, const InputInfoList &Inputs, const ArgList &Args, @@ -42,7 +98,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, if (Output.isFilename()) { CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); - } + } // Set linking mode (i.e., 32/64-bit) and the address of // text and data sections based on arch bit width. @@ -92,11 +148,12 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, C.addCommand(std::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs)); } -/// AIX - AIX tool chain which can call ld(1) directly. -// TODO: Enable direct call to as(1). +/// AIX - AIX tool chain which can call as(1) and ld(1) directly. AIX::AIX(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) : ToolChain(D, Triple, Args) { getFilePaths().push_back(getDriver().SysRoot + "/usr/lib"); } +auto AIX::buildAssembler() const -> Tool * { return new aix::Assembler(*this); } + auto AIX::buildLinker() const -> Tool * { return new aix::Linker(*this); } diff --git a/clang/lib/Driver/ToolChains/AIX.h b/clang/lib/Driver/ToolChains/AIX.h index 58c06c3e4413e..69b948bc0ea82 100644 --- a/clang/lib/Driver/ToolChains/AIX.h +++ b/clang/lib/Driver/ToolChains/AIX.h @@ -16,10 +16,21 @@ namespace clang { namespace driver { namespace tools { -/// aix -- Directly call system default linker. -// TODO: Enable direct call to system default assembler. +/// aix -- Directly call system default assembler and linker. namespace aix { +class LLVM_LIBRARY_VISIBILITY Assembler : public Tool { +public: + Assembler(const ToolChain &TC) : Tool("aix::Assembler", "assembler", TC) {} + + bool hasIntegratedCPP() const override { return false; } + + void ConstructJob(Compilation &C, const JobAction &JA, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &TCArgs, + const char *LinkingOutput) const override; +}; + class LLVM_LIBRARY_VISIBILITY Linker : public Tool { public: Linker(const ToolChain &TC) : Tool("aix::Linker", "linker", TC) {} @@ -53,6 +64,7 @@ class LLVM_LIBRARY_VISIBILITY AIX : public ToolChain { bool isPICDefaultForced() const override { return true; } protected: + Tool *buildAssembler() const override; Tool *buildLinker() const override; }; diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index cca47722c2044..68a57310ad402 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -592,39 +592,11 @@ void arm::getARMTargetFeatures(const ToolChain &TC, Features.push_back("+strict-align"); } - // Do not allow r9 reservation with -frwpi. - if (Args.hasArg(options::OPT_ffixed_r9) && Args.hasArg(options::OPT_frwpi)) { - Arg *A = Args.getLastArg(options::OPT_ffixed_r9); - Arg *B = Args.getLastArg(options::OPT_frwpi); - D.Diag(diag::err_opt_not_valid_with_opt) - << A->getAsString(Args) << B->getAsString(Args); - } - - // The compiler can still use a FP in certain circumstances, - // even when frame pointer elimination is enabled.
Thus we should - not allow to reserve a target's FP register. - const llvm::opt::OptSpecifier RestrictFPOpt = - (Triple.isOSDarwin() || (!Triple.isOSWindows() && Triple.isThumb())) - ? options::OPT_ffixed_r7 - : options::OPT_ffixed_r11; - if (Args.hasArg(RestrictFPOpt)) { - const std::string OptStr = - Args.getLastArg(RestrictFPOpt)->getAsString(Args); - const unsigned int SubStrIndex = strlen("ffixed-r"); - D.Diag(diag::err_reserved_frame_pointer) - << OptStr << OptStr.substr(SubStrIndex); - } -// Reservation of general purpose registers. -#define HANDLE_FFIXED_R(n) \ - if (Args.hasArg(options::OPT_ffixed_r##n)) \ - Features.push_back("+reserve-r" #n) - HANDLE_FFIXED_R(6); - HANDLE_FFIXED_R(7); - HANDLE_FFIXED_R(8); - HANDLE_FFIXED_R(9); - HANDLE_FFIXED_R(10); - HANDLE_FFIXED_R(11); + // llvm does not support reserving registers in general. There is support + // for reserving r9 on ARM though (defined as a platform-specific register + // in ARM EABI). + if (Args.hasArg(options::OPT_ffixed_r9)) + Features.push_back("+reserve-r9"); // The kext linker doesn't know how to deal with movw/movt. if (KernelOrKext || Args.hasArg(options::OPT_mno_movt)) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index ddd1174a75834..917b40f103937 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -635,16 +635,33 @@ static void addDebugCompDirArg(const ArgList &Args, ArgStringList &CmdArgs, /// Add a CC1 and CC1AS option to specify the debug file path prefix map. static void addDebugPrefixMapArg(const Driver &D, const ArgList &Args, ArgStringList &CmdArgs) { - for (const Arg *A : Args.filtered(options::OPT_fdebug_prefix_map_EQ)) { + for (const Arg *A : Args.filtered(options::OPT_ffile_prefix_map_EQ, + options::OPT_fdebug_prefix_map_EQ)) { StringRef Map = A->getValue(); if (Map.find('=') == StringRef::npos) - D.Diag(diag::err_drv_invalid_argument_to_fdebug_prefix_map) << Map; + D.Diag(diag::err_drv_invalid_argument_to_option) + << Map << A->getOption().getName(); else CmdArgs.push_back(Args.MakeArgString("-fdebug-prefix-map=" + Map)); A->claim(); } } +/// Add a CC1 and CC1AS option to specify the macro file path prefix map. +static void addMacroPrefixMapArg(const Driver &D, const ArgList &Args, + ArgStringList &CmdArgs) { + for (const Arg *A : Args.filtered(options::OPT_ffile_prefix_map_EQ, + options::OPT_fmacro_prefix_map_EQ)) { + StringRef Map = A->getValue(); + if (Map.find('=') == StringRef::npos) + D.Diag(diag::err_drv_invalid_argument_to_option) + << Map << A->getOption().getName(); + else + CmdArgs.push_back(Args.MakeArgString("-fmacro-prefix-map=" + Map)); + A->claim(); + } +} + /// Vectorize at all optimization levels greater than 1 except for -Oz. /// For -Oz the loop vectorizer is disabled, while the slp vectorizer is /// enabled. @@ -1355,6 +1372,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA, // For IAMCU add special include arguments. getToolChain().AddIAMCUIncludeArgs(Args, CmdArgs); } + + addMacroPrefixMapArg(D, Args, CmdArgs); } // FIXME: Move to target hook. @@ -2295,9 +2314,18 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, bool AssociativeMath = false; bool ReciprocalMath = false; bool SignedZeros = true; - bool TrappingMath = true; + bool TrappingMath = false; // Implemented via -ffp-exception-behavior + bool TrappingMathPresent = false; // Is trapping-math in args, and not + // overridden by ffp-exception-behavior?
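// The -ffp-model= handling that follows rewrites the umbrella option into
// granular settings before cc1 ever sees it: "fast" behaves like -ffast-math,
// "precise" pins contraction to fast, and "strict" enables rounding and
// trapping math. A condensed view of that expansion (the struct and names are
// illustrative, not the driver's real data model):
#include <iostream>
#include <string>

struct FPSettings {
  bool FastMath = false;     // -ffast-math semantics
  bool RoundingMath = false; // -frounding-math
  bool TrappingMath = false; // implies -ffp-exception-behavior=strict
  std::string Contract;      // -ffp-contract= value, empty if unset
};

static bool expandFPModel(const std::string &Val, FPSettings &Out) {
  if (Val == "fast")
    Out = {true, false, false, "fast"};
  else if (Val == "precise")
    Out = {false, false, false, "fast"};
  else if (Val == "strict")
    Out = {false, true, true, ""};
  else
    return false; // unsupported value, diagnosed by the driver
  return true;
}

int main() {
  FPSettings S;
  if (expandFPModel("strict", S))
    std::cout << "rounding=" << S.RoundingMath
              << " trapping=" << S.TrappingMath << '\n'; // both print 1
}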
+ bool RoundingFPMath = false; + bool RoundingMathPresent = false; // Is rounding-math in args? + // -ffp-model values: strict, fast, precise + StringRef FPModel = ""; + // -ffp-exception-behavior options: strict, maytrap, ignore + StringRef FPExceptionBehavior = ""; StringRef DenormalFPMath = ""; StringRef FPContract = ""; + bool StrictFPModel = false; if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); @@ -2305,7 +2333,73 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, } for (const Arg *A : Args) { - switch (A->getOption().getID()) { + auto optID = A->getOption().getID(); + bool PreciseFPModel = false; + switch (optID) { + default: + break; + case options::OPT_frounding_math: + case options::OPT_ftrapping_math: + case options::OPT_ffp_exception_behavior_EQ: + D.Diag(clang::diag::warn_drv_experimental_fp_control_incomplete_opt) + << A->getOption().getName(); + break; + case options::OPT_ffp_model_EQ: { + D.Diag(clang::diag::warn_drv_experimental_fp_control_incomplete_opt) + << A->getOption().getName(); + // If -ffp-model= is seen, reset to fno-fast-math + HonorINFs = true; + HonorNaNs = true; + // Turning *off* -ffast-math restores the toolchain default. + MathErrno = TC.IsMathErrnoDefault(); + AssociativeMath = false; + ReciprocalMath = false; + SignedZeros = true; + // -fno_fast_math restores default denormal and fpcontract handling + DenormalFPMath = ""; + FPContract = ""; + StringRef Val = A->getValue(); + if (OFastEnabled && !Val.equals("fast")) { + // Only -ffp-model=fast is compatible with OFast, ignore. + D.Diag(clang::diag::warn_drv_overriding_flag_option) + << Args.MakeArgString("-ffp-model=" + Val) + << "-Ofast"; + break; + } + StrictFPModel = false; + PreciseFPModel = true; + // ffp-model= is a Driver option, it is entirely rewritten into more + // granular options before being passed into cc1. + // Use the gcc option in the switch below. + if (!FPModel.empty() && !FPModel.equals(Val)) { + D.Diag(clang::diag::warn_drv_overriding_flag_option) + << Args.MakeArgString("-ffp-model=" + FPModel) + << Args.MakeArgString("-ffp-model=" + Val); + FPContract = ""; + } + if (Val.equals("fast")) { + optID = options::OPT_ffast_math; + FPModel = Val; + FPContract = "fast"; + } else if (Val.equals("precise")) { + optID = options::OPT_ffp_contract; + FPModel = Val; + FPContract = "fast"; + PreciseFPModel = true; + } else if (Val.equals("strict")) { + StrictFPModel = true; + optID = options::OPT_frounding_math; + FPExceptionBehavior = "strict"; + FPModel = Val; + TrappingMath = true; + } else + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getOption().getName() << Val; + break; + } + } + + switch (optID) { // If this isn't an FP option skip the claim below default: continue; @@ -2322,19 +2416,82 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, case options::OPT_fno_reciprocal_math: ReciprocalMath = false; break; case options::OPT_fsigned_zeros: SignedZeros = true; break; case options::OPT_fno_signed_zeros: SignedZeros = false; break; - case options::OPT_ftrapping_math: TrappingMath = true; break; - case options::OPT_fno_trapping_math: TrappingMath = false; break; + case options::OPT_ftrapping_math: + if (!TrappingMathPresent && !FPExceptionBehavior.empty() && + !FPExceptionBehavior.equals("strict")) + // Warn that previous value of option is overridden. 
+ D.Diag(clang::diag::warn_drv_overriding_flag_option) + << Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior) + << "-ftrapping-math"; + TrappingMath = true; + TrappingMathPresent = true; + FPExceptionBehavior = "strict"; + break; + case options::OPT_fno_trapping_math: + if (!TrappingMathPresent && !FPExceptionBehavior.empty() && + !FPExceptionBehavior.equals("ignore")) + // Warn that previous value of option is overridden. + D.Diag(clang::diag::warn_drv_overriding_flag_option) + << Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior) + << "-fno-trapping-math"; + TrappingMath = false; + TrappingMathPresent = true; + FPExceptionBehavior = "ignore"; + break; + + case options::OPT_frounding_math: + RoundingFPMath = true; + RoundingMathPresent = true; + break; + + case options::OPT_fno_rounding_math: + RoundingFPMath = false; + RoundingMathPresent = false; + break; case options::OPT_fdenormal_fp_math_EQ: DenormalFPMath = A->getValue(); break; - // Validate and pass through -fp-contract option. + // Validate and pass through -ffp-contract option. case options::OPT_ffp_contract: { StringRef Val = A->getValue(); - if (Val == "fast" || Val == "on" || Val == "off") + if (PreciseFPModel) { + // -ffp-model=precise enables ffp-contract=fast as a side effect + // the FPContract value has already been set to a string literal + // and the Val string isn't a pertinent value. + ; + } else if (Val.equals("fast") || Val.equals("on") || Val.equals("off")) FPContract = Val; else + D.Diag(diag::err_drv_unsupported_option_argument) + << A->getOption().getName() << Val; + break; + } + + // Validate and pass through -ffp-model option. + case options::OPT_ffp_model_EQ: + // This should only occur in the error case + // since the optID has been replaced by a more granular + // floating point option. + break; + + // Validate and pass through -ffp-exception-behavior option. + case options::OPT_ffp_exception_behavior_EQ: { + StringRef Val = A->getValue(); + if (!TrappingMathPresent && !FPExceptionBehavior.empty() && + !FPExceptionBehavior.equals(Val)) + // Warn that previous value of option is overridden. + D.Diag(clang::diag::warn_drv_overriding_flag_option) + << Args.MakeArgString("-ffp-exception-behavior=" + FPExceptionBehavior) + << Args.MakeArgString("-ffp-exception-behavior=" + Val); + TrappingMath = TrappingMathPresent = false; + if (Val.equals("ignore") || Val.equals("maytrap")) + FPExceptionBehavior = Val; + else if (Val.equals("strict")) { + FPExceptionBehavior = Val; + TrappingMath = TrappingMathPresent = true; + } else D.Diag(diag::err_drv_unsupported_option_argument) << A->getOption().getName() << Val; break; @@ -2354,12 +2511,14 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ReciprocalMath = true; SignedZeros = false; TrappingMath = false; + FPExceptionBehavior = ""; break; case options::OPT_fno_unsafe_math_optimizations: AssociativeMath = false; ReciprocalMath = false; SignedZeros = true; TrappingMath = true; + FPExceptionBehavior = "strict"; // -fno_unsafe_math_optimizations restores default denormal handling DenormalFPMath = ""; break; @@ -2377,6 +2536,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ReciprocalMath = true; SignedZeros = false; TrappingMath = false; + RoundingFPMath = false; // If fast-math is set then set the fp-contract mode to fast. 
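// Throughout the hunks above, -f(no-)trapping-math and
// -ffp-exception-behavior= feed one piece of state, and a later flag that
// changes it draws an "overriding" warning instead of an error: the last flag
// on the command line wins. The scheme in miniature (flag strings are just
// examples):
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::string Behavior; // empty means "not set yet"
  const std::vector<std::pair<std::string, std::string>> Flags = {
      {"-ftrapping-math", "strict"},
      {"-ffp-exception-behavior=ignore", "ignore"}, // overrides the above
  };
  for (const auto &[Flag, Value] : Flags) {
    if (!Behavior.empty() && Behavior != Value)
      std::cout << "warning: '" << Flag << "' overrides '" << Behavior
                << "'\n";
    Behavior = Value; // last one wins
  }
  std::cout << "final: " << Behavior << '\n'; // final: ignore
}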
FPContract = "fast"; break; @@ -2390,12 +2550,31 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, AssociativeMath = false; ReciprocalMath = false; SignedZeros = true; - TrappingMath = true; + TrappingMath = false; + RoundingFPMath = false; // -fno_fast_math restores default denormal and fpcontract handling DenormalFPMath = ""; FPContract = ""; break; } + if (StrictFPModel) { + // If -ffp-model=strict has been specified on command line but + // subsequent options conflict then emit warning diagnostic. + if (HonorINFs && HonorNaNs && + !AssociativeMath && !ReciprocalMath && + SignedZeros && TrappingMath && RoundingFPMath && + DenormalFPMath.empty() && FPContract.empty()) + // OK: Current Arg doesn't conflict with -ffp-model=strict + ; + else { + StrictFPModel = false; + FPModel = ""; + D.Diag(clang::diag::warn_drv_overriding_flag_option) + << "-ffp-model=strict" << + ((A->getNumValues() == 0) ? A->getSpelling() + : Args.MakeArgString(A->getSpelling() + A->getValue())); + } + } // If we handled this option claim it A->claim(); @@ -2423,7 +2602,11 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, if (ReciprocalMath) CmdArgs.push_back("-freciprocal-math"); - if (!TrappingMath) + if (TrappingMath) { + // FP Exception Behavior is also set to strict + assert(FPExceptionBehavior.equals("strict")); + CmdArgs.push_back("-ftrapping-math"); + } else if (TrappingMathPresent) CmdArgs.push_back("-fno-trapping-math"); if (!DenormalFPMath.empty()) @@ -2433,14 +2616,37 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, if (!FPContract.empty()) CmdArgs.push_back(Args.MakeArgString("-ffp-contract=" + FPContract)); + if (!RoundingFPMath) + CmdArgs.push_back(Args.MakeArgString("-fno-rounding-math")); + + if (RoundingFPMath && RoundingMathPresent) + CmdArgs.push_back(Args.MakeArgString("-frounding-math")); + + if (!FPExceptionBehavior.empty()) + CmdArgs.push_back(Args.MakeArgString("-ffp-exception-behavior=" + + FPExceptionBehavior)); + ParseMRecip(D, Args, CmdArgs); // -ffast-math enables the __FAST_MATH__ preprocessor macro, but check for the // individual features enabled by -ffast-math instead of the option itself as // that's consistent with gcc's behaviour. if (!HonorINFs && !HonorNaNs && !MathErrno && AssociativeMath && - ReciprocalMath && !SignedZeros && !TrappingMath) + ReciprocalMath && !SignedZeros && !TrappingMath && !RoundingFPMath) { CmdArgs.push_back("-ffast-math"); + if (FPModel.equals("fast")) { + if (FPContract.equals("fast")) + // All set, do nothing. + ; + else if (FPContract.empty()) + // Enable -ffp-contract=fast + CmdArgs.push_back(Args.MakeArgString("-ffp-contract=fast")); + else + D.Diag(clang::diag::warn_drv_overriding_flag_option) + << "-ffp-model=fast" + << Args.MakeArgString("-ffp-contract=" + FPContract); + } + } // Handle __FINITE_MATH_ONLY__ similarly. 
if (!HonorINFs && !HonorNaNs) @@ -4565,9 +4771,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, if (Args.hasArg(options::OPT_fexperimental_new_constant_interpreter)) CmdArgs.push_back("-fexperimental-new-constant-interpreter"); - if (Args.hasArg(options::OPT_fforce_experimental_new_constant_interpreter)) - CmdArgs.push_back("-fforce-experimental-new-constant-interpreter"); - if (Arg *A = Args.getLastArg(options::OPT_fbracket_depth_EQ)) { CmdArgs.push_back("-fbracket-depth"); CmdArgs.push_back(A->getValue()); @@ -5736,7 +5939,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, TC.getTriple().isOSBinFormatCOFF()) && !TC.getTriple().isPS4() && !TC.getTriple().isOSNetBSD() && - !Distro(D.getVFS()).IsGentoo() && + !Distro(D.getVFS(), TC.getTriple()).IsGentoo() && !TC.getTriple().isAndroid() && TC.useIntegratedAs())) CmdArgs.push_back("-faddrsig"); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 8c704a3078adc..02871d2ce411f 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -115,7 +115,8 @@ CudaInstallationDetector::CudaInstallationDetector( for (const char *Ver : Versions) Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver); - if (Distro(D.getVFS()).IsDebian() || Distro(D.getVFS()).IsUbuntu()) + Distro Dist(D.getVFS(), llvm::Triple(llvm::sys::getProcessTriple())); + if (Dist.IsDebian() || Dist.IsUbuntu()) // Special case for Debian to have nvidia-cuda-toolkit work // out of the box. More info on http://bugs.debian.org/882505 Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda"); diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index 3e5e8a00652d3..85e94fe018e6a 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -12,6 +12,7 @@ #include "Arch/Sparc.h" #include "CommonArgs.h" #include "clang/Driver/Compilation.h" +#include "clang/Driver/DriverDiagnostic.h" #include "clang/Driver/Options.h" #include "clang/Driver/SanitizerArgs.h" #include "llvm/Option/ArgList.h" @@ -30,6 +31,7 @@ void freebsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *LinkingOutput) const { claimNoWarnArgs(Args); ArgStringList CmdArgs; + const auto &D = getToolChain().getDriver(); // When building 32-bit code on FreeBSD/amd64, we have to explicitly // instruct as in the base system to assemble 32-bit code. 
@@ -103,6 +105,19 @@ void freebsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA, } } + for (const Arg *A : Args.filtered(options::OPT_ffile_prefix_map_EQ, + options::OPT_fdebug_prefix_map_EQ)) { + StringRef Map = A->getValue(); + if (Map.find('=') == StringRef::npos) + D.Diag(diag::err_drv_invalid_argument_to_option) + << Map << A->getOption().getName(); + else { + CmdArgs.push_back(Args.MakeArgString("--debug-prefix-map")); + CmdArgs.push_back(Args.MakeArgString(Map)); + } + A->claim(); + } + Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); CmdArgs.push_back("-o"); diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 9bea0b15c8739..4c5d4003f1442 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -351,10 +351,9 @@ SanitizerMask Fuchsia::getDefaultSanitizers() const { case llvm::Triple::x86_64: Res |= SanitizerKind::SafeStack; break; - case llvm::Triple::riscv64: - break; default: - llvm_unreachable("invalid architecture"); + // TODO: Enable SafeStack on RISC-V once tested. + break; } return Res; } diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index abcf4377fe718..fb13474e0791c 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -929,6 +929,19 @@ void tools::gnutools::Assembler::ConstructJob(Compilation &C, } } + for (const Arg *A : Args.filtered(options::OPT_ffile_prefix_map_EQ, + options::OPT_fdebug_prefix_map_EQ)) { + StringRef Map = A->getValue(); + if (Map.find('=') == StringRef::npos) + D.Diag(diag::err_drv_invalid_argument_to_option) + << Map << A->getOption().getName(); + else { + CmdArgs.push_back(Args.MakeArgString("--debug-prefix-map")); + CmdArgs.push_back(Args.MakeArgString(Map)); + } + A->claim(); + } + Args.AddAllArgs(CmdArgs, options::OPT_I); Args.AddAllArgValues(CmdArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler); diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 1053a1a609789..d1f570e75a9bd 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -343,9 +343,8 @@ void HIPToolChain::addClangTargetOptions( else WaveFrontSizeBC = "oclc_wavefrontsize64_off.amdgcn.bc"; - BCLibs.append({"hip.amdgcn.bc", "opencl.amdgcn.bc", "ocml.amdgcn.bc", - "ockl.amdgcn.bc", "oclc_finite_only_off.amdgcn.bc", - FlushDenormalControlBC, + BCLibs.append({"hip.amdgcn.bc", "ocml.amdgcn.bc", "ockl.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", FlushDenormalControlBC, "oclc_correctly_rounded_sqrt_on.amdgcn.bc", "oclc_unsafe_math_off.amdgcn.bc", ISAVerBC, WaveFrontSizeBC}); diff --git a/clang/lib/Driver/ToolChains/InterfaceStubs.cpp b/clang/lib/Driver/ToolChains/InterfaceStubs.cpp index f441f4787097b..8f947e79bd1f1 100644 --- a/clang/lib/Driver/ToolChains/InterfaceStubs.cpp +++ b/clang/lib/Driver/ToolChains/InterfaceStubs.cpp @@ -46,6 +46,8 @@ void Merger::ConstructJob(Compilation &C, const JobAction &JA, // Here we append the input files. If the input files are object files, then // we look for .ifs files present in the same location as the object files. 
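// Both assembler paths above forward -ffile-prefix-map and -fdebug-prefix-map
// values to GNU as as --debug-prefix-map, after checking that each value
// contains an '='. The validate-and-split step is essentially this
// (simplified, without the driver's diagnostics engine):
#include <cstddef>
#include <iostream>
#include <string>

// Splits "OLD=NEW" into its halves; returns false when '=' is missing.
static bool splitPrefixMap(const std::string &Map, std::string &Old,
                           std::string &New) {
  const std::size_t Eq = Map.find('=');
  if (Eq == std::string::npos)
    return false; // the driver reports err_drv_invalid_argument_to_option
  Old = Map.substr(0, Eq);
  New = Map.substr(Eq + 1);
  return true;
}

int main() {
  std::string Old, New;
  if (splitPrefixMap("/build/src=/usr/src", Old, New))
    std::cout << Old << " -> " << New << '\n';
  std::cout << splitPrefixMap("nodelimiter", Old, New) << '\n'; // 0
}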
for (const auto &Input : Inputs) { + if (!Input.isFilename()) + continue; SmallString<128> InputFilename(Input.getFilename()); if (Input.getType() == types::TY_Object) llvm::sys::path::replace_extension(InputFilename, ".ifs"); diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index a744b7d632397..2d902cdf7ae65 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -245,7 +245,7 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) .str()); } - Distro Distro(D.getVFS()); + Distro Distro(D.getVFS(), Triple); if (Distro.IsAlpineLinux() || Triple.isAndroid()) { ExtraOpts.push_back("-z"); @@ -516,7 +516,7 @@ std::string Linux::getDynamicLinker(const ArgList &Args) const { const llvm::Triple::ArchType Arch = getArch(); const llvm::Triple &Triple = getTriple(); - const Distro Distro(getDriver().getVFS()); + const Distro Distro(getDriver().getVFS(), Triple); if (Triple.isAndroid()) return Triple.isArch64Bit() ? "/system/bin/linker64" : "/system/bin/linker"; diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index 55b82592c09fd..1bb7c35d0c522 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -92,10 +92,10 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, C.addCommand(std::make_unique<Command>(JA, *this, Linker, CmdArgs, Inputs)); - // When optimizing, if wasm-opt is in the PATH, run wasm-opt. + // When optimizing, if wasm-opt is available, run it. if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { - if (llvm::ErrorOr<std::string> WasmOptPath = - llvm::sys::findProgramByName("wasm-opt")) { + auto WasmOptPath = getToolChain().GetProgramPath("wasm-opt"); + if (WasmOptPath != "wasm-opt") { StringRef OOpt = "s"; if (A->getOption().matches(options::OPT_O4) || A->getOption().matches(options::OPT_Ofast)) @@ -106,7 +106,7 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, OOpt = A->getValue(); if (OOpt != "0") { - const char *WasmOpt = Args.MakeArgString(*WasmOptPath); + const char *WasmOpt = Args.MakeArgString(WasmOptPath); ArgStringList CmdArgs; CmdArgs.push_back(Output.getFilename()); CmdArgs.push_back(Args.MakeArgString(llvm::Twine("-O") + OOpt)); diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 4e42bab561823..f12bca48c630e 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -537,6 +537,8 @@ template <> struct MappingTraits<FormatStyle> { IO.mapOptional("SpacesBeforeTrailingComments", Style.SpacesBeforeTrailingComments); IO.mapOptional("SpacesInAngles", Style.SpacesInAngles); + IO.mapOptional("SpacesInConditionalStatement", + Style.SpacesInConditionalStatement); IO.mapOptional("SpacesInContainerLiterals", Style.SpacesInContainerLiterals); IO.mapOptional("SpacesInCStyleCastParentheses", @@ -817,6 +819,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.SpaceBeforeCpp11BracedList = false; LLVMStyle.SpaceBeforeSquareBrackets = false; LLVMStyle.SpacesInAngles = false; + LLVMStyle.SpacesInConditionalStatement = false; LLVMStyle.PenaltyBreakAssignment = prec::Assignment; LLVMStyle.PenaltyBreakComment = 300; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 9fe7fdc9ce937..d5d394e61926a 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1829,7 +1829,8 @@ class AnnotatingParser { // Use heuristics to recognize unary
operators. if (PrevToken->isOneOf(tok::equal, tok::l_paren, tok::comma, tok::l_square, tok::question, tok::colon, tok::kw_return, - tok::kw_case, tok::at, tok::l_brace, tok::kw_throw)) + tok::kw_case, tok::at, tok::l_brace, tok::kw_throw, + tok::kw_co_return, tok::kw_co_yield)) return TT_UnaryOperator; // There can't be two consecutive binary operators. @@ -2591,6 +2592,13 @@ bool TokenAnnotator::spaceRequiredBeforeParens(const FormatToken &Right) const { Right.ParameterCount > 0); } +/// Returns \c true if the token is followed by a boolean condition, \c false +/// otherwise. +static bool isKeywordWithCondition(const FormatToken &Tok) { + return Tok.isOneOf(tok::kw_if, tok::kw_for, tok::kw_while, tok::kw_switch, + tok::kw_constexpr); +} + bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, const FormatToken &Left, const FormatToken &Right) { @@ -2609,6 +2617,15 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, (Left.is(tok::l_brace) && Left.BlockKind != BK_Block && Right.is(tok::r_brace) && Right.BlockKind != BK_Block)) return Style.SpaceInEmptyParentheses; + if (Style.SpacesInConditionalStatement) { + if (Left.is(tok::l_paren) && Left.Previous && + isKeywordWithCondition(*Left.Previous)) + return true; + if (Right.is(tok::r_paren) && Right.MatchingParen && + Right.MatchingParen->Previous && + isKeywordWithCondition(*Right.MatchingParen->Previous)) + return true; + } if (Left.is(tok::l_paren) || Right.is(tok::r_paren)) return (Right.is(TT_CastRParen) || (Left.MatchingParen && Left.MatchingParen->is(TT_CastRParen))) @@ -3043,7 +3060,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, // The identifier might actually be a macro name such as ALWAYS_INLINE. If // this turns out to be too lenient, add analysis of the identifier itself. 
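// SpacesInConditionalStatement, wired up above, pads only the parentheses of
// control-flow conditions (if/for/while/switch); ordinary call parentheses
// are untouched. Roughly, a formatter run with the option enabled turns the
// first form below into the second:
#include <iostream>

int main() {
  int x = 1;
  // SpacesInConditionalStatement: false (the LLVM default)
  if (x > 0)
    std::cout << "default\n";
  // SpacesInConditionalStatement: true, note the padded condition
  if ( x > 0 )
    std::cout << "padded\n";
}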
return Right.WhitespaceRange.getBegin() != Right.WhitespaceRange.getEnd(); - if (Right.is(tok::coloncolon) && !Left.isOneOf(tok::l_brace, tok::comment)) + if (Right.is(tok::coloncolon) && + !Left.isOneOf(tok::l_brace, tok::comment, tok::l_paren)) return (Left.is(TT_TemplateOpener) && Style.Standard < FormatStyle::LS_Cpp11) || !(Left.isOneOf(tok::l_paren, tok::r_paren, tok::l_square, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 181bbc6440fae..b4db2fa3c4022 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -2876,8 +2876,6 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, getLastArgIntValue(Args, OPT_fconstexpr_steps, 1048576, Diags); Opts.EnableNewConstInterp = Args.hasArg(OPT_fexperimental_new_constant_interpreter); - Opts.ForceNewConstInterp = - Args.hasArg(OPT_fforce_experimental_new_constant_interpreter); Opts.BracketDepth = getLastArgIntValue(Args, OPT_fbracket_depth, 256, Diags); Opts.DelayedTemplateParsing = Args.hasArg(OPT_fdelayed_template_parsing); Opts.NumLargeByValueCopy = @@ -3173,6 +3171,34 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; } + LangOptions::FPRoundingModeKind FPRM = LangOptions::FPR_ToNearest; + if (Args.hasArg(OPT_frounding_math)) { + FPRM = LangOptions::FPR_Dynamic; + } + Opts.setFPRoundingMode(FPRM); + + if (Args.hasArg(OPT_ftrapping_math)) { + Opts.setFPExceptionMode(LangOptions::FPE_Strict); + } + + if (Args.hasArg(OPT_fno_trapping_math)) { + Opts.setFPExceptionMode(LangOptions::FPE_Ignore); + } + + LangOptions::FPExceptionModeKind FPEB = LangOptions::FPE_Ignore; + if (Arg *A = Args.getLastArg(OPT_ffp_exception_behavior_EQ)) { + StringRef Val = A->getValue(); + if (Val.equals("ignore")) + FPEB = LangOptions::FPE_Ignore; + else if (Val.equals("maytrap")) + FPEB = LangOptions::FPE_MayTrap; + else if (Val.equals("strict")) + FPEB = LangOptions::FPE_Strict; + else + Diags.Report(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; + } + Opts.setFPExceptionMode(FPEB); + Opts.RetainCommentsFromSystemHeaders = Args.hasArg(OPT_fretain_comments_from_system_headers); @@ -3341,6 +3367,9 @@ static void ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, for (const auto *A : Args.filtered(OPT_error_on_deserialized_pch_decl)) Opts.DeserializedPCHDeclsToErrorOn.insert(A->getValue()); + for (const auto &A : Args.getAllArgValues(OPT_fmacro_prefix_map_EQ)) + Opts.MacroPrefixMap.insert(StringRef(A).split('=')); + if (const Arg *A = Args.getLastArg(OPT_preamble_bytes_EQ)) { StringRef Value(A->getValue()); size_t Comma = Value.find(','); diff --git a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp index ab62b633cda38..18c4814bbd5cc 100644 --- a/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp +++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp @@ -26,7 +26,8 @@ using namespace llvm::opt; std::unique_ptr<CompilerInvocation> clang::createInvocationFromCommandLine( ArrayRef<const char *> ArgList, IntrusiveRefCntPtr<DiagnosticsEngine> Diags, - IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, bool ShouldRecoverOnErorrs) { + IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS, bool ShouldRecoverOnErorrs, + std::vector<std::string> *CC1Args) { if (!Diags.get()) { // No diagnostics engine was provided, so create our own diagnostics object // with the default options.
@@ -89,6 +90,8 @@ std::unique_ptr<CompilerInvocation> clang::createInvocationFromCommandLine( } const ArgStringList &CCArgs = Cmd.getArguments(); + if (CC1Args) + *CC1Args = {CCArgs.begin(), CCArgs.end()}; auto CI = std::make_unique<CompilerInvocation>(); if (!CompilerInvocation::CreateFromArgs(*CI, CCArgs, *Diags) && !ShouldRecoverOnErorrs) return nullptr; diff --git a/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp b/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp index 025e6eb1508fc..3b0a5668af940 100644 --- a/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp +++ b/clang/lib/Frontend/InterfaceStubFunctionsConsumer.cpp @@ -195,6 +195,10 @@ class InterfaceStubFunctionsConsumer : public ASTConsumer { case Decl::Kind::TemplateTemplateParm: case Decl::Kind::ClassTemplatePartialSpecialization: case Decl::Kind::IndirectField: + case Decl::Kind::ConstructorUsingShadow: + case Decl::Kind::CXXDeductionGuide: + case Decl::Kind::NamespaceAlias: + case Decl::Kind::UnresolvedUsingTypename: return true; case Decl::Kind::Var: { // Bail on any VarDecl that either has no named symbol. diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index ced32c670288e..0e5a8e504dc59 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -535,21 +535,15 @@ PrecompiledPreamble::TempPCHFile::CreateNewPreamblePCHFile() { // FIXME: This is a hack so that we can override the preamble file during // crash-recovery testing, which is the only case where the preamble files // are not necessarily cleaned up. - const char *TmpFile = ::getenv("CINDEXTEST_PREAMBLE_FILE"); - if (TmpFile) - return TempPCHFile::createFromCustomPath(TmpFile); - return TempPCHFile::createInSystemTempDir("preamble", "pch"); -} + if (const char *TmpFile = ::getenv("CINDEXTEST_PREAMBLE_FILE")) + return TempPCHFile(TmpFile); -llvm::ErrorOr<PrecompiledPreamble::TempPCHFile> -PrecompiledPreamble::TempPCHFile::createInSystemTempDir(const Twine &Prefix, - StringRef Suffix) { llvm::SmallString<64> File; // Using a version of createTemporaryFile with a file descriptor guarantees // that we would never get a race condition in a multi-threaded setting // (i.e., multiple threads getting the same temporary path). int FD; - auto EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, FD, File); + auto EC = llvm::sys::fs::createTemporaryFile("preamble", "pch", FD, File); if (EC) return EC; // We only needed to make sure the file exists, close the file right away. @@ -557,11 +551,6 @@ PrecompiledPreamble::TempPCHFile::createInSystemTempDir(const Twine &Prefix, return TempPCHFile(std::move(File).str()); } -llvm::ErrorOr<PrecompiledPreamble::TempPCHFile> -PrecompiledPreamble::TempPCHFile::createFromCustomPath(const Twine &Path) { - return TempPCHFile(Path.str()); -} - PrecompiledPreamble::TempPCHFile::TempPCHFile(std::string FilePath) : FilePath(std::move(FilePath)) { TemporaryFiles::getInstance().addFile(*this->FilePath); diff --git a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp index f063ed711c44c..029bfe1cd6008 100644 --- a/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp +++ b/clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp @@ -763,12 +763,13 @@ bool Minimizer::lexEndif(const char *&First, const char *const End) { if (top() == pp_else) popToken(); - // Strip out "#elif" if they're empty. - while (top() == pp_elif) - popToken(); - - // If "#if" is empty, strip it and skip the "#endif".
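// The TempPCHFile cleanup above collapses two factory helpers into one code
// path: honor the CINDEXTEST_PREAMBLE_FILE override when present, otherwise
// create a unique preamble PCH in the system temp directory. The control
// flow, sketched with the C library standing in for llvm::sys::fs (which,
// unlike tmpnam, also returns an open descriptor to rule out races):
#include <cstdio>
#include <cstdlib>
#include <string>

static std::string preamblePCHPath() {
  // Test hook: an environment variable wins over automatic placement.
  if (const char *Override = std::getenv("CINDEXTEST_PREAMBLE_FILE"))
    return Override;
  // Fallback: a freshly generated unique name, as createTemporaryFile would.
  char Buf[L_tmpnam];
  return std::tmpnam(Buf) ? std::string(Buf) + ".pch" : std::string();
}

int main() { std::printf("%s\n", preamblePCHPath().c_str()); }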
-  if (top() == pp_if || top() == pp_ifdef || top() == pp_ifndef) {
+  // If "#ifdef" is empty, strip it and skip the "#endif".
+  //
+  // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
+  // we can skip empty `#if` and `#elif` blocks as well after scanning for a
+  // literal __has_include in the condition. Even without that rule we could
+  // drop the tokens if we scan for identifiers in the condition and find none.
+  if (top() == pp_ifdef || top() == pp_ifndef) {
     popToken();
     skipLine(First, End);
     return false;
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 43236c2ef8caa..6f470cae4929d 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -29,6 +29,7 @@
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorLexer.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Lex/Token.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -1450,6 +1451,17 @@ static bool isTargetEnvironment(const TargetInfo &TI,
   return TI.getTriple().getEnvironment() == Env.getEnvironment();
 }
 
+static void remapMacroPath(
+    SmallString<256> &Path,
+    const std::map<std::string, std::string, std::greater<std::string>>
+        &MacroPrefixMap) {
+  for (const auto &Entry : MacroPrefixMap)
+    if (Path.startswith(Entry.first)) {
+      Path = (Twine(Entry.second) + Path.substr(Entry.first.size())).str();
+      break;
+    }
+}
+
 /// ExpandBuiltinMacro - If an identifier token is read that is to be expanded
 /// as a builtin macro, handle it and return the next token as 'Tok'.
 void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
@@ -1516,7 +1528,7 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
     }
 
     // Escape this filename.  Turn '\' -> '\\' '"' -> '\"'
-    SmallString<128> FN;
+    SmallString<256> FN;
     if (PLoc.isValid()) {
       // __FILE_NAME__ is a Clang-specific extension that expands to the
       // the last part of __FILE__.
@@ -1532,6 +1544,7 @@ void Preprocessor::ExpandBuiltinMacro(Token &Tok) {
         FN += PLoc.getFilename();
     }
     Lexer::Stringify(FN);
+    remapMacroPath(FN, PPOpts->MacroPrefixMap);
     OS << '"' << FN << '"';
   }
   Tok.setKind(tok::string_literal);
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index c6ffbfc968d07..fe409327bfb4b 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -3367,7 +3367,7 @@ void Parser::ParseCXXMemberSpecification(SourceLocation RecordLoc,
 
     // We've finished parsing everything, including default argument
     // initializers.
-    Actions.ActOnFinishCXXNonNestedClass(TagDecl);
+    Actions.ActOnFinishCXXNonNestedClass();
   }
 
   if (TagDecl)
diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp
index 77eed54376098..7dfe71fb9ebcc 100644
--- a/clang/lib/Parse/ParseExprCXX.cpp
+++ b/clang/lib/Parse/ParseExprCXX.cpp
@@ -1352,6 +1352,13 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
     // Parse attribute-specifier[opt].
     MaybeParseCXX11Attributes(Attr, &DeclEndLoc);
 
+    // Parse OpenCL addr space attribute.
+    if (Tok.isOneOf(tok::kw___private, tok::kw___global, tok::kw___local,
+                    tok::kw___constant, tok::kw___generic)) {
+      ParseOpenCLQualifiers(DS.getAttributes());
+      ConsumeToken();
+    }
+
     SourceLocation FunLocalRangeEnd = DeclEndLoc;
 
     // Parse trailing-return-type[opt].
@@ -1380,10 +1387,12 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer(
                                            NoexceptExpr.isUsable() ?
NoexceptExpr.get() : nullptr, /*ExceptionSpecTokens*/ nullptr, /*DeclsInPrototype=*/None, LParenLoc, FunLocalRangeEnd, D, - TrailingReturnType), + TrailingReturnType, &DS), std::move(Attr), DeclEndLoc); } else if (Tok.isOneOf(tok::kw_mutable, tok::arrow, tok::kw___attribute, - tok::kw_constexpr, tok::kw_consteval) || + tok::kw_constexpr, tok::kw_consteval, + tok::kw___private, tok::kw___global, tok::kw___local, + tok::kw___constant, tok::kw___generic) || (Tok.is(tok::l_square) && NextToken().is(tok::l_square))) { // It's common to forget that one needs '()' before 'mutable', an attribute // specifier, or the result type. Deal with this. @@ -1392,6 +1401,11 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( case tok::kw_mutable: TokKind = 0; break; case tok::arrow: TokKind = 1; break; case tok::kw___attribute: + case tok::kw___private: + case tok::kw___global: + case tok::kw___local: + case tok::kw___constant: + case tok::kw___generic: case tok::l_square: TokKind = 2; break; case tok::kw_constexpr: TokKind = 3; break; case tok::kw_consteval: TokKind = 4; break; diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index cb2710ab8c67c..c111682024772 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -1192,6 +1192,59 @@ bool Parser::ParseParenExprOrCondition(StmtResult *InitStmt, return false; } +namespace { + +enum MisleadingStatementKind { MSK_if, MSK_else, MSK_for, MSK_while }; + +struct MisleadingIndentationChecker { + Parser &P; + SourceLocation StmtLoc; + SourceLocation PrevLoc; + unsigned NumDirectives; + MisleadingStatementKind Kind; + bool NeedsChecking; + bool ShouldSkip; + MisleadingIndentationChecker(Parser &P, MisleadingStatementKind K, + SourceLocation SL) + : P(P), StmtLoc(SL), PrevLoc(P.getCurToken().getLocation()), + NumDirectives(P.getPreprocessor().getNumDirectives()), Kind(K), + NeedsChecking(true), ShouldSkip(P.getCurToken().is(tok::l_brace)) { + if (!P.MisleadingIndentationElseLoc.isInvalid()) { + StmtLoc = P.MisleadingIndentationElseLoc; + P.MisleadingIndentationElseLoc = SourceLocation(); + } + if (Kind == MSK_else && !ShouldSkip) + P.MisleadingIndentationElseLoc = SL; + } + void Check() { + NeedsChecking = false; + Token Tok = P.getCurToken(); + if (ShouldSkip || NumDirectives != P.getPreprocessor().getNumDirectives() || + Tok.isOneOf(tok::semi, tok::r_brace) || Tok.isAnnotation() || + Tok.getLocation().isMacroID() || PrevLoc.isMacroID() || + StmtLoc.isMacroID() || + (Kind == MSK_else && P.MisleadingIndentationElseLoc.isInvalid())) { + P.MisleadingIndentationElseLoc = SourceLocation(); + return; + } + + SourceManager &SM = P.getPreprocessor().getSourceManager(); + unsigned PrevColNum = SM.getSpellingColumnNumber(PrevLoc); + unsigned CurColNum = SM.getSpellingColumnNumber(Tok.getLocation()); + unsigned StmtColNum = SM.getSpellingColumnNumber(StmtLoc); + + if (PrevColNum != 0 && CurColNum != 0 && StmtColNum != 0 && + ((PrevColNum > StmtColNum && PrevColNum == CurColNum) || + !Tok.isAtStartOfLine()) && SM.getPresumedLineNumber(StmtLoc) != + SM.getPresumedLineNumber(Tok.getLocation())) { + P.Diag(Tok.getLocation(), diag::warn_misleading_indentation) + << Kind; + P.Diag(StmtLoc, diag::note_previous_statement); + } + } +}; + +} /// ParseIfStatement /// if-statement: [C99 6.8.4.1] @@ -1266,6 +1319,8 @@ StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { // ParseScope InnerScope(this, Scope::DeclScope, C99orCXX, Tok.is(tok::l_brace)); + MisleadingIndentationChecker MIChecker(*this, MSK_if, IfLoc); 
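// The shape of code the checker above is built to diagnose, as a sketch:
//
//   if (cond)
//     step1();
//     step2();   // not part of the 'if'; same column as step1(), so
//                // -Wmisleading-indentation fires with a note at the 'if'
//
// Braced bodies, macro-expanded locations, and statements separated by
// preprocessor directives are skipped, via ShouldSkip, the isMacroID()
// tests, and the NumDirectives comparison above.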
+ // Read the 'then' stmt. SourceLocation ThenStmtLoc = Tok.getLocation(); @@ -1279,6 +1334,9 @@ StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { ThenStmt = ParseStatement(&InnerStatementTrailingElseLoc); } + if (Tok.isNot(tok::kw_else)) + MIChecker.Check(); + // Pop the 'if' scope if needed. InnerScope.Exit(); @@ -1306,12 +1364,17 @@ StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { ParseScope InnerScope(this, Scope::DeclScope, C99orCXX, Tok.is(tok::l_brace)); + MisleadingIndentationChecker MIChecker(*this, MSK_else, ElseLoc); + EnterExpressionEvaluationContext PotentiallyDiscarded( Actions, Sema::ExpressionEvaluationContext::DiscardedStatement, nullptr, Sema::ExpressionEvaluationContextRecord::EK_Other, /*ShouldEnter=*/ConstexprCondition && *ConstexprCondition); ElseStmt = ParseStatement(); + if (ElseStmt.isUsable()) + MIChecker.Check(); + // Pop the 'else' scope if needed. InnerScope.Exit(); } else if (Tok.is(tok::code_completion)) { @@ -1485,9 +1548,13 @@ StmtResult Parser::ParseWhileStatement(SourceLocation *TrailingElseLoc) { // ParseScope InnerScope(this, Scope::DeclScope, C99orCXX, Tok.is(tok::l_brace)); + MisleadingIndentationChecker MIChecker(*this, MSK_while, WhileLoc); + // Read the body statement. StmtResult Body(ParseStatement(TrailingElseLoc)); + if (Body.isUsable()) + MIChecker.Check(); // Pop the body scope if needed. InnerScope.Exit(); WhileScope.Exit(); @@ -1919,9 +1986,14 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) { if (C99orCXXorObjC) getCurScope()->decrementMSManglingNumber(); + MisleadingIndentationChecker MIChecker(*this, MSK_for, ForLoc); + // Read the body statement. StmtResult Body(ParseStatement(TrailingElseLoc)); + if (Body.isUsable()) + MIChecker.Check(); + // Pop the body scope if needed. InnerScope.Exit(); diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index d7327eebd95c2..418729a4b2658 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -2067,9 +2067,21 @@ Parser::TPResult Parser::TryParseFunctionDeclarator() { /// Parser::TPResult Parser::TryParseBracketDeclarator() { ConsumeBracket(); - if (!SkipUntil(tok::r_square, StopAtSemi)) + + // A constant-expression cannot begin with a '{', but the + // expr-or-braced-init-list of a postfix-expression can. + if (Tok.is(tok::l_brace)) + return TPResult::False; + + if (!SkipUntil(tok::r_square, tok::comma, StopAtSemi | StopBeforeMatch)) return TPResult::Error; + // If we hit a comma before the ']', this is not a constant-expression, + // but might still be the expr-or-braced-init-list of a postfix-expression. + if (Tok.isNot(tok::r_square)) + return TPResult::False; + + ConsumeBracket(); return TPResult::Ambiguous; } diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td index 0bd4c51a04c2a..353e0c1d8c8d2 100644 --- a/clang/lib/Sema/OpenCLBuiltins.td +++ b/clang/lib/Sema/OpenCLBuiltins.td @@ -274,14 +274,21 @@ def Event : Type<"Event", QualType<"OCLEventTy">>; def VecAndScalar: IntList<"VecAndScalar", [1, 2, 3, 4, 8, 16]>; def VecNoScalar : IntList<"VecNoScalar", [2, 3, 4, 8, 16]>; def Vec1 : IntList<"Vec1", [1]>; +def Vec2 : IntList<"Vec2", [2]>; +def Vec4 : IntList<"Vec4", [4]>; +def Vec8 : IntList<"Vec8", [8]>; +def Vec16 : IntList<"Vec16", [16]>; def Vec1234 : IntList<"Vec1234", [1, 2, 3, 4]>; // Type lists. 
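// Illustrative OpenCL C usage of the shuffle builtins this file gains
// further below (a sketch, not part of the generated declarations):
//
//   float4 a = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
//   uint8 mask = (uint8)(3, 2, 1, 0, 3, 2, 1, 0);
//   float8 r = shuffle(a, mask);   // result width follows the mask
//
// The mask is always an unsigned integer vector whose element size matches
// the source component size, which is what the TLAllUnsigned list encodes.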
-def TLAll : TypeList<"TLAll", [Char, UChar, Short, UShort, Int, UInt, Long, ULong, Float, Double, Half]>; +def TLAll : TypeList<"TLAll", [Char, UChar, Short, UShort, Int, UInt, Long, ULong, Float, Double, Half]>; +def TLAllUnsigned : TypeList<"TLAllUnsigned", [UChar, UChar, UShort, UShort, UInt, UInt, ULong, ULong, UInt, ULong, UShort]>; def TLFloat : TypeList<"TLFloat", [Float, Double, Half]>; def TLSignedInts : TypeList<"TLSignedInts", [Char, Short, Int, Long]>; def TLUnsignedInts : TypeList<"TLUnsignedInts", [UChar, UShort, UInt, ULong]>; +def TLIntLongFloats : TypeList<"TLIntLongFloats", [Int, UInt, Long, ULong, Float, Double, Half]>; + // All unsigned integer types twice, to facilitate unsigned return types for e.g. // uchar abs(char) and // uchar abs(uchar). @@ -306,6 +313,8 @@ def SGenTypeN : GenericType<"SGenTypeN", TLSignedInts, VecAndScalar def UGenTypeN : GenericType<"UGenTypeN", TLUnsignedInts, VecAndScalar>; // Float def FGenTypeN : GenericType<"FGenTypeN", TLFloat, VecAndScalar>; +// (u)int, (u)long, and all floats +def IntLongFloatGenType1 : GenericType<"IntLongFloatGenType1", TLIntLongFloats, Vec1>; // GenType definitions for every single base type (e.g. fp32 only). // Names are like: GenTypeFloatVecAndScalar. @@ -867,6 +876,31 @@ foreach Type = [Int, UInt] in { } } +//-------------------------------------------------------------------- +// OpenCL v1.1 s6.11.12, v1.2 s6.12.12, v2.0 s6.13.12 - Miscellaneous Vector Functions +// --- Table 19 --- +foreach name = ["shuffle"] in { + foreach VSize1 = [Vec2, Vec4, Vec8, Vec16] in { + foreach VSize2 = [Vec2, Vec4, Vec8, Vec16] in { + def : Builtin, + GenericType<"TLAll" # VSize2.Name, TLAll, VSize2>, + GenericType<"TLAllUnsigned" # VSize1.Name, TLAllUnsigned, VSize1>], + Attr.Const>; + } + } +} +foreach name = ["shuffle2"] in { + foreach VSize1 = [Vec2, Vec4, Vec8, Vec16] in { + foreach VSize2 = [Vec2, Vec4, Vec8, Vec16] in { + def : Builtin, + GenericType<"TLAll" # VSize2.Name, TLAll, VSize2>, + GenericType<"TLAll" # VSize2.Name, TLAll, VSize2>, + GenericType<"TLAllUnsigned" # VSize1.Name, TLAllUnsigned, VSize1>], + Attr.Const>; + } + } +} + //-------------------------------------------------------------------- // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14: Image Read and Write Functions // OpenCL Extension v2.0 s5.1.8 and s6.1.8: Image Read and Write Functions @@ -1020,6 +1054,27 @@ foreach aQual = ["WO", "RW"] in { } +//-------------------------------------------------------------------- +// OpenCL v2.0 s6.13.15 - Work-group Functions +// --- Table 26 --- +let MinVersion = CL20 in { + foreach name = ["work_group_all", "work_group_any"] in { + def : Builtin; + } + foreach name = ["work_group_broadcast"] in { + def : Builtin; + def : Builtin; + def : Builtin; + } + foreach op = ["add", "min", "max"] in { + foreach name = ["work_group_reduce_", "work_group_scan_exclusive_", + "work_group_scan_inclusive_"] in { + def : Builtin; + } + } +} + + // OpenCL v2.0 s9.17.3: Additions to section 6.13.1: Work-Item Functions let MinVersion = CL20 in { let Extension = "cl_khr_subgroups" in { diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index a371b7b793ef3..fea48cfced166 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -1311,6 +1311,12 @@ NamedDecl *Sema::getCurFunctionOrMethodDecl() { return nullptr; } +LangAS Sema::getDefaultCXXMethodAddrSpace() const { + if (getLangOpts().OpenCL) + return LangAS::opencl_generic; + return LangAS::Default; +} + void Sema::EmitCurrentDiagnostic(unsigned DiagID) { // 
FIXME: It doesn't make sense to me that DiagID is an incoming argument here // and yet we also use the current diag ID on the DiagnosticsEngine. This has diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index efd2ad22dcf12..dca54fddb1193 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3061,8 +3061,37 @@ bool Sema::CheckHexagonBuiltinFunctionCall(unsigned BuiltinID, CheckHexagonBuiltinArgument(BuiltinID, TheCall); } +bool Sema::CheckMipsBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { + return CheckMipsBuiltinCpu(BuiltinID, TheCall) || + CheckMipsBuiltinArgument(BuiltinID, TheCall); +} + +bool Sema::CheckMipsBuiltinCpu(unsigned BuiltinID, CallExpr *TheCall) { + const TargetInfo &TI = Context.getTargetInfo(); + + if (Mips::BI__builtin_mips_addu_qb <= BuiltinID && + BuiltinID <= Mips::BI__builtin_mips_lwx) { + if (!TI.hasFeature("dsp")) + return Diag(TheCall->getBeginLoc(), diag::err_mips_builtin_requires_dsp); + } + + if (Mips::BI__builtin_mips_absq_s_qb <= BuiltinID && + BuiltinID <= Mips::BI__builtin_mips_subuh_r_qb) { + if (!TI.hasFeature("dspr2")) + return Diag(TheCall->getBeginLoc(), + diag::err_mips_builtin_requires_dspr2); + } + + if (Mips::BI__builtin_msa_add_a_b <= BuiltinID && + BuiltinID <= Mips::BI__builtin_msa_xori_b) { + if (!TI.hasFeature("msa")) + return Diag(TheCall->getBeginLoc(), diag::err_mips_builtin_requires_msa); + } + + return false; +} -// CheckMipsBuiltinFunctionCall - Checks the constant value passed to the +// CheckMipsBuiltinArgument - Checks the constant value passed to the // intrinsic is correct. The switch statement is ordered by DSP, MSA. The // ordering for DSP is unspecified. MSA is ordered by the data format used // by the underlying instruction i.e., df/m, df/n and then by size. @@ -3071,7 +3100,7 @@ bool Sema::CheckHexagonBuiltinFunctionCall(unsigned BuiltinID, // definitions from include/clang/Basic/BuiltinsMips.def. // FIXME: GCC is strict on signedness for some of these intrinsics, we should // be too. -bool Sema::CheckMipsBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { +bool Sema::CheckMipsBuiltinArgument(unsigned BuiltinID, CallExpr *TheCall) { unsigned i = 0, l = 0, u = 0, m = 0; switch (BuiltinID) { default: return false; @@ -12991,7 +13020,8 @@ class SequenceChecker : public EvaluatedExprVisitor { // expression or statement in the body of the function [and thus before // the value computation of its result]. SequencedSubexpression Sequenced(*this); - Base::VisitCallExpr(CE); + SemaRef.runWithSufficientStackSpace(CE->getExprLoc(), + [&] { Base::VisitCallExpr(CE); }); // FIXME: CXXNewExpr and CXXDeleteExpr implicitly call functions. 
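// What the CheckMipsBuiltinCpu guards above reject, as a sketch (compiling
// for a MIPS target without -mdsp; diagnostic wording approximate):
//
//   typedef signed char v4i8 __attribute__((vector_size(4)));
//   v4i8 f(v4i8 a, v4i8 b) {
//     return __builtin_mips_addu_qb(a, b);  // error: requires 'dsp' feature
//   }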
} @@ -14746,6 +14776,8 @@ void Sema::RefersToMemberWithReducedAlignment( bool AnyIsPacked = false; do { QualType BaseType = ME->getBase()->getType(); + if (BaseType->isDependentType()) + return; if (ME->isArrow()) BaseType = BaseType->getPointeeType(); RecordDecl *RD = BaseType->castAs()->getDecl(); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ed5c6f878776b..a0fdf77dba51b 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -6117,6 +6117,41 @@ bool Sema::inferObjCARCLifetime(ValueDecl *decl) { return false; } +void Sema::deduceOpenCLAddressSpace(ValueDecl *Decl) { + if (Decl->getType().getQualifiers().hasAddressSpace()) + return; + if (VarDecl *Var = dyn_cast(Decl)) { + QualType Type = Var->getType(); + if (Type->isSamplerT() || Type->isVoidType()) + return; + LangAS ImplAS = LangAS::opencl_private; + if ((getLangOpts().OpenCLCPlusPlus || getLangOpts().OpenCLVersion >= 200) && + Var->hasGlobalStorage()) + ImplAS = LangAS::opencl_global; + // If the original type from a decayed type is an array type and that array + // type has no address space yet, deduce it now. + if (auto DT = dyn_cast(Type)) { + auto OrigTy = DT->getOriginalType(); + if (!OrigTy.getQualifiers().hasAddressSpace() && OrigTy->isArrayType()) { + // Add the address space to the original array type and then propagate + // that to the element type through `getAsArrayType`. + OrigTy = Context.getAddrSpaceQualType(OrigTy, ImplAS); + OrigTy = QualType(Context.getAsArrayType(OrigTy), 0); + // Re-generate the decayed type. + Type = Context.getDecayedType(OrigTy); + } + } + Type = Context.getAddrSpaceQualType(Type, ImplAS); + // Apply any qualifiers (including address space) from the array type to + // the element type. This implements C99 6.7.3p8: "If the specification of + // an array type includes any type qualifiers, the element type is so + // qualified, not the array type." + if (Type->isArrayType()) + Type = QualType(Context.getAsArrayType(Type), 0); + Decl->setType(Type); + } +} + static void checkAttributesAfterMerging(Sema &S, NamedDecl &ND) { // Ensure that an auto decl is deduced otherwise the checks below might cache // the wrong linkage. @@ -6474,27 +6509,79 @@ static bool isDeclExternC(const Decl *D) { llvm_unreachable("Unknown type of decl!"); } +/// Returns true if there hasn't been any invalid type diagnosed. +static bool diagnoseOpenCLTypes(Scope *S, Sema &Se, Declarator &D, + DeclContext *DC, QualType R) { + // OpenCL v2.0 s6.9.b - Image type can only be used as a function argument. + // OpenCL v2.0 s6.13.16.1 - Pipe type can only be used as a function + // argument. + if (R->isImageType() || R->isPipeType()) { + Se.Diag(D.getIdentifierLoc(), + diag::err_opencl_type_can_only_be_used_as_function_parameter) + << R; + D.setInvalidType(); + return false; + } -NamedDecl *Sema::ActOnVariableDeclarator( - Scope *S, Declarator &D, DeclContext *DC, TypeSourceInfo *TInfo, - LookupResult &Previous, MultiTemplateParamsArg TemplateParamLists, - bool &AddToScope, ArrayRef Bindings) { - QualType R = TInfo->getType(); - DeclarationName Name = GetNameForDeclarator(D).getName(); + // OpenCL v1.2 s6.9.r: + // The event type cannot be used to declare a program scope variable. + // OpenCL v2.0 s6.9.q: + // The clk_event_t and reserve_id_t types cannot be declared in program + // scope. 
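// How the deduction implemented in deduceOpenCLAddressSpace above plays out
// in OpenCL source, as a sketch:
//
//   int g;                  // program scope: deduced __global (OpenCL 2.0+)
//   kernel void k(void) {
//     int l;                // function local: deduced __private
//     float a[4];           // the array element type is qualified as well,
//   }                       // per the C99 6.7.3p8 handling above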
+ if (NULL == S->getParent()) { + if (R->isReserveIDT() || R->isClkEventT() || R->isEventT()) { + Se.Diag(D.getIdentifierLoc(), + diag::err_invalid_type_for_program_scope_var) + << R; + D.setInvalidType(); + return false; + } + } - IdentifierInfo *II = Name.getAsIdentifierInfo(); + // OpenCL v1.0 s6.8.a.3: Pointers to functions are not allowed. + QualType NR = R; + while (NR->isPointerType()) { + if (NR->isFunctionPointerType()) { + Se.Diag(D.getIdentifierLoc(), diag::err_opencl_function_pointer); + D.setInvalidType(); + return false; + } + NR = NR->getPointeeType(); + } - if (D.isDecompositionDeclarator()) { - // Take the name of the first declarator as our name for diagnostic - // purposes. - auto &Decomp = D.getDecompositionDeclarator(); - if (!Decomp.bindings().empty()) { - II = Decomp.bindings()[0].Name; - Name = II; + if (!Se.getOpenCLOptions().isEnabled("cl_khr_fp16")) { + // OpenCL v1.2 s6.1.1.1: reject declaring variables of the half and + // half array type (unless the cl_khr_fp16 extension is enabled). + if (Se.Context.getBaseElementType(R)->isHalfType()) { + Se.Diag(D.getIdentifierLoc(), diag::err_opencl_half_declaration) << R; + D.setInvalidType(); + return false; } - } else if (!II) { - Diag(D.getIdentifierLoc(), diag::err_bad_variable_name) << Name; - return nullptr; + } + + // OpenCL v1.2 s6.9.r: + // The event type cannot be used with the __local, __constant and __global + // address space qualifiers. + if (R->isEventT()) { + if (R.getAddressSpace() != LangAS::opencl_private) { + Se.Diag(D.getBeginLoc(), diag::err_event_t_addr_space_qual); + D.setInvalidType(); + return false; + } + } + + // C++ for OpenCL does not allow the thread_local storage qualifier. + // OpenCL C does not support thread_local either, and + // also reject all other thread storage class specifiers. + DeclSpec::TSCS TSC = D.getDeclSpec().getThreadStorageClassSpec(); + if (TSC != TSCS_unspecified) { + bool IsCXX = Se.getLangOpts().OpenCLCPlusPlus; + Se.Diag(D.getDeclSpec().getThreadStorageClassSpecLoc(), + diag::err_opencl_unknown_type_specifier) + << IsCXX << Se.getLangOpts().getOpenCLVersionTuple().getAsString() + << DeclSpec::getSpecifierName(TSC) << 1; + D.setInvalidType(); + return false; } if (R->isSamplerT()) { @@ -6503,7 +6590,8 @@ NamedDecl *Sema::ActOnVariableDeclarator( // space qualifiers. if (R.getAddressSpace() == LangAS::opencl_local || R.getAddressSpace() == LangAS::opencl_global) { - Diag(D.getIdentifierLoc(), diag::err_wrong_sampler_addressspace); + Se.Diag(D.getIdentifierLoc(), diag::err_wrong_sampler_addressspace); + D.setInvalidType(); } // OpenCL v1.2 s6.12.14.1: @@ -6512,79 +6600,35 @@ NamedDecl *Sema::ActOnVariableDeclarator( if (DC->isTranslationUnit() && !(R.getAddressSpace() == LangAS::opencl_constant || R.isConstQualified())) { - Diag(D.getIdentifierLoc(), diag::err_opencl_nonconst_global_sampler); + Se.Diag(D.getIdentifierLoc(), diag::err_opencl_nonconst_global_sampler); D.setInvalidType(); } + if (D.isInvalidType()) + return false; } + return true; +} - if (getLangOpts().OpenCL) { - // OpenCL v2.0 s6.9.b - Image type can only be used as a function argument. - // OpenCL v2.0 s6.13.16.1 - Pipe type can only be used as a function - // argument. - if (R->isImageType() || R->isPipeType()) { - Diag(D.getIdentifierLoc(), - diag::err_opencl_type_can_only_be_used_as_function_parameter) - << R; - D.setInvalidType(); - return nullptr; - } - - // OpenCL v1.2 s6.9.r: - // The event type cannot be used to declare a program scope variable. 
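// Declarations of the shape these checks reject, as a sketch:
//
//   image2d_t img;          // image/pipe types: function parameters only
//   event_t ev;             // error at program scope (OpenCL v1.2 s6.9.r)
//   int (*fp)(int);         // pointers to functions are not allowed
//   half h;                 // error unless cl_khr_fp16 is enabled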
- // OpenCL v2.0 s6.9.q: - // The clk_event_t and reserve_id_t types cannot be declared in program scope. - if (NULL == S->getParent()) { - if (R->isReserveIDT() || R->isClkEventT() || R->isEventT()) { - Diag(D.getIdentifierLoc(), - diag::err_invalid_type_for_program_scope_var) << R; - D.setInvalidType(); - return nullptr; - } - } - - // OpenCL v1.0 s6.8.a.3: Pointers to functions are not allowed. - QualType NR = R; - while (NR->isPointerType()) { - if (NR->isFunctionPointerType()) { - Diag(D.getIdentifierLoc(), diag::err_opencl_function_pointer); - D.setInvalidType(); - break; - } - NR = NR->getPointeeType(); - } - - if (!getOpenCLOptions().isEnabled("cl_khr_fp16")) { - // OpenCL v1.2 s6.1.1.1: reject declaring variables of the half and - // half array type (unless the cl_khr_fp16 extension is enabled). - if (Context.getBaseElementType(R)->isHalfType()) { - Diag(D.getIdentifierLoc(), diag::err_opencl_half_declaration) << R; - D.setInvalidType(); - } - } +NamedDecl *Sema::ActOnVariableDeclarator( + Scope *S, Declarator &D, DeclContext *DC, TypeSourceInfo *TInfo, + LookupResult &Previous, MultiTemplateParamsArg TemplateParamLists, + bool &AddToScope, ArrayRef Bindings) { + QualType R = TInfo->getType(); + DeclarationName Name = GetNameForDeclarator(D).getName(); - // OpenCL v1.2 s6.9.r: - // The event type cannot be used with the __local, __constant and __global - // address space qualifiers. - if (R->isEventT()) { - if (R.getAddressSpace() != LangAS::opencl_private) { - Diag(D.getBeginLoc(), diag::err_event_t_addr_space_qual); - D.setInvalidType(); - } - } + IdentifierInfo *II = Name.getAsIdentifierInfo(); - // C++ for OpenCL does not allow the thread_local storage qualifier. - // OpenCL C does not support thread_local either, and - // also reject all other thread storage class specifiers. - DeclSpec::TSCS TSC = D.getDeclSpec().getThreadStorageClassSpec(); - if (TSC != TSCS_unspecified) { - bool IsCXX = getLangOpts().OpenCLCPlusPlus; - Diag(D.getDeclSpec().getThreadStorageClassSpecLoc(), - diag::err_opencl_unknown_type_specifier) - << IsCXX << getLangOpts().getOpenCLVersionTuple().getAsString() - << DeclSpec::getSpecifierName(TSC) << 1; - D.setInvalidType(); - return nullptr; + if (D.isDecompositionDeclarator()) { + // Take the name of the first declarator as our name for diagnostic + // purposes. 
+ auto &Decomp = D.getDecompositionDeclarator(); + if (!Decomp.bindings().empty()) { + II = Decomp.bindings()[0].Name; + Name = II; } + } else if (!II) { + Diag(D.getIdentifierLoc(), diag::err_bad_variable_name) << Name; + return nullptr; } DeclSpec::SCS SCSpec = D.getDeclSpec().getStorageClassSpec(); @@ -6950,6 +6994,13 @@ NamedDecl *Sema::ActOnVariableDeclarator( } } + if (getLangOpts().OpenCL) { + + deduceOpenCLAddressSpace(NewVD); + + diagnoseOpenCLTypes(S, *this, D, DC, NewVD->getType()); + } + // Handle attributes prior to checking for duplicates in MergeVarDecl ProcessDeclAttributes(S, NewVD, D); @@ -7017,8 +7068,6 @@ NamedDecl *Sema::ActOnVariableDeclarator( Diag(E->getExprLoc(), diag::err_asm_invalid_global_var_reg) << Label; else if (HasSizeMismatch) Diag(E->getExprLoc(), diag::err_asm_register_size_mismatch) << Label; - else if (!TI.isRegisterReservedGlobally(Label)) - Diag(E->getExprLoc(), diag::err_asm_missing_fixed_reg_opt) << Label; } if (!R->isIntegralType(Context) && !R->isPointerType()) { @@ -11289,6 +11338,9 @@ bool Sema::DeduceVariableDeclarationType(VarDecl *VDecl, bool DirectInit, if (getLangOpts().ObjCAutoRefCount && inferObjCARCLifetime(VDecl)) VDecl->setInvalidDecl(); + if (getLangOpts().OpenCL) + deduceOpenCLAddressSpace(VDecl); + // If this is a redeclaration, check that the type we just deduced matches // the previously declared type. if (VarDecl *Old = VDecl->getPreviousDecl()) { @@ -13111,6 +13163,10 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D) { if (New->hasAttr()) { Diag(New->getLocation(), diag::err_block_on_nonlocal); } + + if (getLangOpts().OpenCL) + deduceOpenCLAddressSpace(New); + return New; } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 8c434be523172..fa6a2e92b8cbb 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11417,10 +11417,9 @@ void Sema::setupImplicitSpecialMemberType(CXXMethodDecl *SpecialMem, // Build an exception specification pointing back at this constructor. FunctionProtoType::ExtProtoInfo EPI = getImplicitMethodEPI(*this, SpecialMem); - if (getLangOpts().OpenCLCPlusPlus) { - // OpenCL: Implicitly defaulted special member are of the generic address - // space. - EPI.TypeQuals.addAddressSpace(LangAS::opencl_generic); + LangAS AS = getDefaultCXXMethodAddrSpace(); + if (AS != LangAS::Default) { + EPI.TypeQuals.addAddressSpace(AS); } auto QT = Context.getFunctionType(ResultTy, Args, EPI); @@ -11830,7 +11829,7 @@ void Sema::ActOnFinishCXXMemberDecls() { } } -void Sema::ActOnFinishCXXNonNestedClass(Decl *D) { +void Sema::ActOnFinishCXXNonNestedClass() { referenceDLLExportedClassMethods(); if (!DelayedDllExportMemberFunctions.empty()) { @@ -12330,8 +12329,9 @@ CXXMethodDecl *Sema::DeclareImplicitCopyAssignment(CXXRecordDecl *ClassDecl) { return nullptr; QualType ArgType = Context.getTypeDeclType(ClassDecl); - if (Context.getLangOpts().OpenCLCPlusPlus) - ArgType = Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic); + LangAS AS = getDefaultCXXMethodAddrSpace(); + if (AS != LangAS::Default) + ArgType = Context.getAddrSpaceQualType(ArgType, AS); QualType RetType = Context.getLValueReferenceType(ArgType); bool Const = ClassDecl->implicitCopyAssignmentHasConstParam(); if (Const) @@ -12406,8 +12406,7 @@ static void diagnoseDeprecatedCopyOperation(Sema &S, CXXMethodDecl *CopyOp) { // In Microsoft mode, assignment operations don't affect constructors and // vice versa. 
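// With the relaxed check below, a user-declared destructor is chosen as the
// deprecation witness even when it is defaulted; the diagnostic itself still
// requires isUserProvided(). A sketch of the diagnosed pattern:
//
//   struct S {
//     ~S();                  // user-provided destructor
//   };
//   void g(S a) { S b = a; } // implicit copy ctor: -Wdeprecated-copy-dtor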
-    if (RD->hasUserDeclaredDestructor() &&
-        RD->getDestructor()->isUserProvided()) {
+    if (RD->hasUserDeclaredDestructor()) {
       UserDeclaredOperation = RD->getDestructor();
     } else if (!isa<CXXConstructorDecl>(CopyOp) &&
                RD->hasUserDeclaredCopyConstructor() &&
@@ -12435,9 +12434,10 @@ static void diagnoseDeprecatedCopyOperation(Sema &S, CXXMethodDecl *CopyOp) {
   if (UserDeclaredOperation && UserDeclaredOperation->isUserProvided()) {
     S.Diag(UserDeclaredOperation->getLocation(),
-         diag::warn_deprecated_copy_operation)
-      << RD << /*copy assignment*/!isa<CXXConstructorDecl>(CopyOp)
-      << /*destructor*/isa<CXXDestructorDecl>(UserDeclaredOperation);
+           isa<CXXDestructorDecl>(UserDeclaredOperation)
+               ? diag::warn_deprecated_copy_dtor_operation
+               : diag::warn_deprecated_copy_operation)
+        << RD << /*copy assignment*/ !isa<CXXConstructorDecl>(CopyOp);
   }
 }
 
@@ -12656,8 +12656,9 @@ CXXMethodDecl *Sema::DeclareImplicitMoveAssignment(CXXRecordDecl *ClassDecl) {
   // constructor rules.
 
   QualType ArgType = Context.getTypeDeclType(ClassDecl);
-  if (Context.getLangOpts().OpenCLCPlusPlus)
-    ArgType = Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic);
+  LangAS AS = getDefaultCXXMethodAddrSpace();
+  if (AS != LangAS::Default)
+    ArgType = Context.getAddrSpaceQualType(ArgType, AS);
   QualType RetType = Context.getLValueReferenceType(ArgType);
   ArgType = Context.getRValueReferenceType(ArgType);
 
@@ -13034,8 +13035,9 @@ CXXConstructorDecl *Sema::DeclareImplicitCopyConstructor(
   if (Const)
     ArgType = ArgType.withConst();
 
-  if (Context.getLangOpts().OpenCLCPlusPlus)
-    ArgType = Context.getAddrSpaceQualType(ArgType, LangAS::opencl_generic);
+  LangAS AS = getDefaultCXXMethodAddrSpace();
+  if (AS != LangAS::Default)
+    ArgType = Context.getAddrSpaceQualType(ArgType, AS);
 
   ArgType = Context.getLValueReferenceType(ArgType);
 
@@ -13166,8 +13168,9 @@ CXXConstructorDecl *Sema::DeclareImplicitMoveConstructor(
   QualType ClassType = Context.getTypeDeclType(ClassDecl);
 
   QualType ArgType = ClassType;
-  if (Context.getLangOpts().OpenCLCPlusPlus)
-    ArgType = Context.getAddrSpaceQualType(ClassType, LangAS::opencl_generic);
+  LangAS AS = getDefaultCXXMethodAddrSpace();
+  if (AS != LangAS::Default)
+    ArgType = Context.getAddrSpaceQualType(ClassType, AS);
   ArgType = Context.getRValueReferenceType(ArgType);
 
   bool Constexpr = defaultedSpecialMemberIsConstexpr(*this, ClassDecl,
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 48d3e69c5bc7a..f01f03d756696 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -13103,6 +13103,15 @@ ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
   if (ResultTy.isNull() || LHS.isInvalid() || RHS.isInvalid())
     return ExprError();
 
+  if (ResultTy->isRealFloatingType() &&
+      (getLangOpts().getFPRoundingMode() != LangOptions::FPR_ToNearest ||
+       getLangOpts().getFPExceptionMode() != LangOptions::FPE_Ignore)) {
+    // Mark the current function as using floating point constrained
+    // intrinsics.
+    if (FunctionDecl *F = dyn_cast<FunctionDecl>(CurContext))
+      F->setUsesFPIntrin(true);
+  }
+
   // Some of the binary operations require promoting operands of half vector to
   // float vectors and truncating the result back to half vector.
For now, we do // this only when HalfArgsAndReturn is set (that is, when the target is arm or diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index c1812922cc034..ee17f826c7527 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4105,9 +4105,26 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, << From->getSourceRange(); } + // Defer address space conversion to the third conversion. + QualType FromPteeType = From->getType()->getPointeeType(); + QualType ToPteeType = ToType->getPointeeType(); + QualType NewToType = ToType; + if (!FromPteeType.isNull() && !ToPteeType.isNull() && + FromPteeType.getAddressSpace() != ToPteeType.getAddressSpace()) { + NewToType = Context.removeAddrSpaceQualType(ToPteeType); + NewToType = Context.getAddrSpaceQualType(NewToType, + FromPteeType.getAddressSpace()); + if (ToType->isObjCObjectPointerType()) + NewToType = Context.getObjCObjectPointerType(NewToType); + else if (ToType->isBlockPointerType()) + NewToType = Context.getBlockPointerType(NewToType); + else + NewToType = Context.getPointerType(NewToType); + } + CastKind Kind; CXXCastPath BasePath; - if (CheckPointerConversion(From, ToType, Kind, BasePath, CStyle)) + if (CheckPointerConversion(From, NewToType, Kind, BasePath, CStyle)) return ExprError(); // Make sure we extend blocks if necessary. @@ -4118,8 +4135,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, From = E.get(); } if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) - CheckObjCConversion(SourceRange(), ToType, From, CCK); - From = ImpCastExprToType(From, ToType, Kind, VK_RValue, &BasePath, CCK) + CheckObjCConversion(SourceRange(), NewToType, From, CCK); + From = ImpCastExprToType(From, NewToType, Kind, VK_RValue, &BasePath, CCK) .get(); break; } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index fbc8e8e5d23d0..065fd672a194e 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6653,6 +6653,7 @@ struct IndirectLocalPathEntry { VarInit, LValToRVal, LifetimeBoundCall, + GslReferenceInit, GslPointerInit } Kind; Expr *E; @@ -6783,12 +6784,24 @@ static bool shouldTrackFirstArgument(const FunctionDecl *FD) { static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, LocalVisitor Visit) { - auto VisitPointerArg = [&](const Decl *D, Expr *Arg) { + auto VisitPointerArg = [&](const Decl *D, Expr *Arg, bool Value) { // We are not interested in the temporary base objects of gsl Pointers: // Temp().ptr; // Here ptr might not dangle. if (isa(Arg->IgnoreImpCasts())) return; - Path.push_back({IndirectLocalPathEntry::GslPointerInit, Arg, D}); + // Once we initialized a value with a reference, it can no longer dangle. + if (!Value) { + for (auto It = Path.rbegin(), End = Path.rend(); It != End; ++It) { + if (It->Kind == IndirectLocalPathEntry::GslReferenceInit) + continue; + if (It->Kind == IndirectLocalPathEntry::GslPointerInit) + return; + break; + } + } + Path.push_back({Value ? 
IndirectLocalPathEntry::GslPointerInit + : IndirectLocalPathEntry::GslReferenceInit, + Arg, D}); if (Arg->isGLValue()) visitLocalsRetainedByReferenceBinding(Path, Arg, RK_ReferenceBinding, Visit, @@ -6802,18 +6815,21 @@ static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, if (auto *MCE = dyn_cast(Call)) { const auto *MD = cast_or_null(MCE->getDirectCallee()); if (MD && shouldTrackImplicitObjectArg(MD)) - VisitPointerArg(MD, MCE->getImplicitObjectArgument()); + VisitPointerArg(MD, MCE->getImplicitObjectArgument(), + !MD->getReturnType()->isReferenceType()); return; } else if (auto *OCE = dyn_cast(Call)) { FunctionDecl *Callee = OCE->getDirectCallee(); if (Callee && Callee->isCXXInstanceMember() && shouldTrackImplicitObjectArg(cast(Callee))) - VisitPointerArg(Callee, OCE->getArg(0)); + VisitPointerArg(Callee, OCE->getArg(0), + !Callee->getReturnType()->isReferenceType()); return; } else if (auto *CE = dyn_cast(Call)) { FunctionDecl *Callee = CE->getDirectCallee(); if (Callee && shouldTrackFirstArgument(Callee)) - VisitPointerArg(Callee, CE->getArg(0)); + VisitPointerArg(Callee, CE->getArg(0), + !Callee->getReturnType()->isReferenceType()); return; } @@ -6821,7 +6837,7 @@ static void handleGslAnnotatedTypes(IndirectLocalPath &Path, Expr *Call, const auto *Ctor = CCE->getConstructor(); const CXXRecordDecl *RD = Ctor->getParent(); if (CCE->getNumArgs() > 0 && RD->hasAttr()) - VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0]); + VisitPointerArg(Ctor->getParamDecl(0), CCE->getArgs()[0], true); } } @@ -7287,6 +7303,7 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, case IndirectLocalPathEntry::AddressOf: case IndirectLocalPathEntry::LValToRVal: case IndirectLocalPathEntry::LifetimeBoundCall: + case IndirectLocalPathEntry::GslReferenceInit: case IndirectLocalPathEntry::GslPointerInit: // These exist primarily to mark the path as not permitting or // supporting lifetime extension. @@ -7309,7 +7326,8 @@ static bool pathOnlyInitializesGslPointer(IndirectLocalPath &Path) { continue; if (It->Kind == IndirectLocalPathEntry::AddressOf) continue; - return It->Kind == IndirectLocalPathEntry::GslPointerInit; + return It->Kind == IndirectLocalPathEntry::GslPointerInit || + It->Kind == IndirectLocalPathEntry::GslReferenceInit; } return false; } @@ -7532,6 +7550,7 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity, case IndirectLocalPathEntry::LifetimeBoundCall: case IndirectLocalPathEntry::GslPointerInit: + case IndirectLocalPathEntry::GslReferenceInit: // FIXME: Consider adding a note for these. 
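// The value/reference distinction recorded above, as a sketch:
//
//   std::vector<int> v;
//   auto it = v.begin();        // gsl::Pointer returned by value: tracked
//                               // as GslPointerInit, may dangle
//   const int &e = *v.begin();  // reference result: GslReferenceInit, which
//                               // the backward scan treats as non-dangling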
break; diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index c6b19a0b195c5..14b443e9dac08 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -917,6 +917,10 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, /*IsVariadic=*/false, /*IsCXXMethod=*/true)); EPI.HasTrailingReturn = true; EPI.TypeQuals.addConst(); + LangAS AS = getDefaultCXXMethodAddrSpace(); + if (AS != LangAS::Default) + EPI.TypeQuals.addAddressSpace(AS); + // C++1y [expr.prim.lambda]: // The lambda return type is 'auto', which is replaced by the // trailing-return type if provided and/or deduced from 'return' diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index df817e6bcef13..d9b6cb6a92153 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -823,7 +823,8 @@ static void InsertOCLBuiltinDeclarationsFromTable(Sema &S, LookupResult &LR, NewOpenCLBuiltin->addAttr(ConstAttr::CreateImplicit(Context)); if (OpenCLBuiltin.IsConv) NewOpenCLBuiltin->addAttr(ConvergentAttr::CreateImplicit(Context)); - if ((GenTypeMaxCnt > 1 || Len > 1) && !S.getLangOpts().OpenCLCPlusPlus) + + if (!S.getLangOpts().OpenCLCPlusPlus) NewOpenCLBuiltin->addAttr(OverloadableAttr::CreateImplicit(Context)); LR.addDecl(NewOpenCLBuiltin); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 59178fb671fb0..2523d7edc3e7d 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -4538,6 +4538,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_parallel); + if (LangOpts.OpenMP >= 50) + AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_parallel_sections: Res = ActOnOpenMPParallelSectionsDirective(ClausesWithImplicit, AStmt, @@ -4646,6 +4648,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPTaskLoopSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); + if (LangOpts.OpenMP >= 50) + AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_master_taskloop: Res = ActOnOpenMPMasterTaskLoopDirective( @@ -5443,7 +5447,7 @@ void Sema::markOpenMPDeclareVariantFuncsReferenced(SourceLocation Loc, Func->specific_attrs()) { // TODO: add checks for active OpenMP context where possible. Expr *VariantRef = A->getVariantFuncRef(); - auto *DRE = dyn_cast(VariantRef->IgnoreParenImpCasts()); + auto *DRE = cast(VariantRef->IgnoreParenImpCasts()); auto *F = cast(DRE->getDecl()); if (!F->isDefined() && F->isTemplateInstantiation()) InstantiateFunctionDefinition(Loc, F->getFirstDecl()); @@ -10642,7 +10646,7 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, // A return value of OMPD_unknown signifies that the expression should not // be captured. 
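// OpenMP 5.0 spelling the new 'simd' name-modifier handling enables on the
// composite directives above, as a sketch:
//
//   #pragma omp parallel for simd if(simd: n > 64)
//   for (int i = 0; i < n; ++i)
//     a[i] += b[i];
//
// Under OpenMP <= 4.5 the modifier is not accepted on these directives,
// matching the LangOpts.OpenMP >= 50 guards.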
static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( - OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, + OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, unsigned OpenMPVersion, OpenMPDirectiveKind NameModifier = OMPD_unknown) { OpenMPDirectiveKind CaptureRegion = OMPD_unknown; switch (CKind) { @@ -10677,11 +10681,22 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( if (NameModifier == OMPD_unknown || NameModifier == OMPD_taskloop) CaptureRegion = OMPD_parallel; break; + case OMPD_parallel_for_simd: + if (OpenMPVersion <= 45) + break; + if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd) + CaptureRegion = OMPD_parallel; + break; + case OMPD_taskloop_simd: + if (OpenMPVersion <= 45) + break; + if (NameModifier == OMPD_unknown || NameModifier == OMPD_simd) + CaptureRegion = OMPD_taskloop; + break; case OMPD_cancel: case OMPD_parallel: case OMPD_parallel_sections: case OMPD_parallel_for: - case OMPD_parallel_for_simd: case OMPD_target: case OMPD_target_simd: case OMPD_target_teams: @@ -10691,7 +10706,6 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_distribute_parallel_for_simd: case OMPD_task: case OMPD_taskloop: - case OMPD_taskloop_simd: case OMPD_master_taskloop: case OMPD_master_taskloop_simd: case OMPD_target_data: @@ -11306,8 +11320,8 @@ OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier, ValExpr = Val.get(); OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_if, NameModifier); + CaptureRegion = getOpenMPCaptureRegionForClause( + DKind, OMPC_if, LangOpts.OpenMP, NameModifier); if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; @@ -11338,7 +11352,8 @@ OMPClause *Sema::ActOnOpenMPFinalClause(Expr *Condition, ValExpr = MakeFullExpr(Val.get()).get(); OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_final); + CaptureRegion = + getOpenMPCaptureRegionForClause(DKind, OMPC_final, LangOpts.OpenMP); if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; @@ -11423,7 +11438,8 @@ isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind, } if (!BuildCapture) return true; - *CaptureRegion = getOpenMPCaptureRegionForClause(DKind, CKind); + *CaptureRegion = + getOpenMPCaptureRegionForClause(DKind, CKind, SemaRef.LangOpts.OpenMP); if (*CaptureRegion != OMPD_unknown && !SemaRef.CurContext->isDependentContext()) { ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); @@ -11450,7 +11466,7 @@ OMPClause *Sema::ActOnOpenMPNumThreadsClause(Expr *NumThreads, OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_num_threads); + getOpenMPCaptureRegionForClause(DKind, OMPC_num_threads, LangOpts.OpenMP); if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; @@ -11981,8 +11997,8 @@ OMPClause *Sema::ActOnOpenMPScheduleClause( return nullptr; } } else if (getOpenMPCaptureRegionForClause( - DSAStack->getCurrentDirective(), OMPC_schedule) != - OMPD_unknown && + DSAStack->getCurrentDirective(), OMPC_schedule, + LangOpts.OpenMP) != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector 
Captures; @@ -14818,7 +14834,7 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(Expr *Device, SourceLocation StartLoc, OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_device); + getOpenMPCaptureRegionForClause(DKind, OMPC_device, LangOpts.OpenMP); if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; @@ -16270,7 +16286,7 @@ OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams, OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_num_teams); + getOpenMPCaptureRegionForClause(DKind, OMPC_num_teams, LangOpts.OpenMP); if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; @@ -16296,8 +16312,8 @@ OMPClause *Sema::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_thread_limit); + OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( + DKind, OMPC_thread_limit, LangOpts.OpenMP); if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; @@ -16422,8 +16438,8 @@ OMPClause *Sema::ActOnOpenMPDistScheduleClause( return nullptr; } } else if (getOpenMPCaptureRegionForClause( - DSAStack->getCurrentDirective(), OMPC_dist_schedule) != - OMPD_unknown && + DSAStack->getCurrentDirective(), OMPC_dist_schedule, + LangOpts.OpenMP) != OMPD_unknown && !CurContext->isDependentContext()) { ValExpr = MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index a85fb6c1dc833..e800f7fe74248 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -7004,15 +7004,21 @@ Sema::BuildExpressionFromDeclTemplateArgument(const TemplateArgument &Arg, // We might need to perform a trailing qualification conversion, since // the element type on the parameter could be more qualified than the - // element type in the expression we constructed. + // element type in the expression we constructed, and likewise for a + // function conversion. bool ObjCLifetimeConversion; - if (IsQualificationConversion(((Expr*) RefExpr.get())->getType(), + QualType Ignored; + if (IsFunctionConversion(RefExpr.get()->getType(), ParamType, Ignored) || + IsQualificationConversion(RefExpr.get()->getType(), ParamType.getUnqualifiedType(), false, ObjCLifetimeConversion)) - RefExpr = ImpCastExprToType(RefExpr.get(), ParamType.getUnqualifiedType(), CK_NoOp); + RefExpr = ImpCastExprToType(RefExpr.get(), + ParamType.getUnqualifiedType(), CK_NoOp); + // FIXME: We need to perform derived-to-base or base-to-derived + // pointer-to-member conversions here too. 
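// The case the added IsFunctionConversion check accepts, as a sketch:
//
//   void f() noexcept;
//   template <void (*F)()> struct A {};
//   A<&f> a;   // OK in C++17: dropping noexcept is a function pointer
//              // conversion performed on the template argument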
assert(!RefExpr.isInvalid() && - Context.hasSameType(((Expr*) RefExpr.get())->getType(), + Context.hasSameType(RefExpr.get()->getType(), ParamType.getUnqualifiedType())); return RefExpr; } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index a67ce648e4057..0ed4b8c86803b 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1576,8 +1576,12 @@ TemplateInstantiator::TransformFunctionTypeParam(ParmVarDecl *OldParm, int indexAdjustment, Optional NumExpansions, bool ExpectParameterPack) { - return SemaRef.SubstParmVarDecl(OldParm, TemplateArgs, indexAdjustment, - NumExpansions, ExpectParameterPack); + auto NewParm = + SemaRef.SubstParmVarDecl(OldParm, TemplateArgs, indexAdjustment, + NumExpansions, ExpectParameterPack); + if (NewParm && SemaRef.getLangOpts().OpenCL) + SemaRef.deduceOpenCLAddressSpace(NewParm); + return NewParm; } QualType @@ -2284,8 +2288,10 @@ Sema::InstantiateClass(SourceLocation PointOfInstantiation, CheckCompletedCXXClass(Instantiation); // Default arguments are parsed, if not instantiated. We can go instantiate - // default arg exprs for default constructors if necessary now. - ActOnFinishCXXNonNestedClass(Instantiation); + // default arg exprs for default constructors if necessary now. Unless we're + // parsing a class, in which case wait until that's finished. + if (ParsingClassDepth == 0) + ActOnFinishCXXNonNestedClass(); // Instantiate late parsed attributes, and attach them to their decls. // See Sema::InstantiateAttrs diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c879667333d02..8686341d65f75 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -931,6 +931,9 @@ Decl *TemplateDeclInstantiator::VisitVarDecl(VarDecl *D, SemaRef.inferObjCARCLifetime(Var)) Var->setInvalidDecl(); + if (SemaRef.getLangOpts().OpenCL) + SemaRef.deduceOpenCLAddressSpace(Var); + // Substitute the nested name specifier, if any. if (SubstQualifier(D, Var)) return nullptr; @@ -3070,7 +3073,9 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( } if (SubstReductionType.isNull()) return nullptr; - bool IsCorrect = !SubstReductionType.isNull(); + Expr *Combiner = D->getCombiner(); + Expr *Init = D->getInitializer(); + bool IsCorrect = true; // Create instantiated copy. std::pair ReductionTypes[] = { std::make_pair(SubstReductionType, D->getLocation())}; @@ -3085,23 +3090,10 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( PrevDeclInScope); auto *NewDRD = cast(DRD.get().getSingleDecl()); SemaRef.CurrentInstantiationScope->InstantiatedLocal(D, NewDRD); - if (!RequiresInstantiation) { - if (Expr *Combiner = D->getCombiner()) { - NewDRD->setCombinerData(D->getCombinerIn(), D->getCombinerOut()); - NewDRD->setCombiner(Combiner); - if (Expr *Init = D->getInitializer()) { - NewDRD->setInitializerData(D->getInitOrig(), D->getInitPriv()); - NewDRD->setInitializer(Init, D->getInitializerKind()); - } - } - (void)SemaRef.ActOnOpenMPDeclareReductionDirectiveEnd( - /*S=*/nullptr, DRD, IsCorrect && !D->isInvalidDecl()); - return NewDRD; - } Expr *SubstCombiner = nullptr; Expr *SubstInitializer = nullptr; // Combiners instantiation sequence. 
- if (D->getCombiner()) { + if (Combiner) { SemaRef.ActOnOpenMPDeclareReductionCombinerStart( /*S=*/nullptr, NewDRD); SemaRef.CurrentInstantiationScope->InstantiatedLocal( @@ -3113,46 +3105,41 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( auto *ThisContext = dyn_cast_or_null(Owner); Sema::CXXThisScopeRAII ThisScope(SemaRef, ThisContext, Qualifiers(), ThisContext); - SubstCombiner = SemaRef.SubstExpr(D->getCombiner(), TemplateArgs).get(); + SubstCombiner = SemaRef.SubstExpr(Combiner, TemplateArgs).get(); SemaRef.ActOnOpenMPDeclareReductionCombinerEnd(NewDRD, SubstCombiner); - // Initializers instantiation sequence. - if (D->getInitializer()) { - VarDecl *OmpPrivParm = - SemaRef.ActOnOpenMPDeclareReductionInitializerStart( - /*S=*/nullptr, NewDRD); - SemaRef.CurrentInstantiationScope->InstantiatedLocal( - cast(D->getInitOrig())->getDecl(), - cast(NewDRD->getInitOrig())->getDecl()); - SemaRef.CurrentInstantiationScope->InstantiatedLocal( - cast(D->getInitPriv())->getDecl(), - cast(NewDRD->getInitPriv())->getDecl()); - if (D->getInitializerKind() == OMPDeclareReductionDecl::CallInit) { - SubstInitializer = - SemaRef.SubstExpr(D->getInitializer(), TemplateArgs).get(); - } else { - auto *OldPrivParm = - cast(cast(D->getInitPriv())->getDecl()); - IsCorrect = IsCorrect && OldPrivParm->hasInit(); - if (IsCorrect) - SemaRef.InstantiateVariableInitializer(OmpPrivParm, OldPrivParm, - TemplateArgs); - } - SemaRef.ActOnOpenMPDeclareReductionInitializerEnd( - NewDRD, SubstInitializer, OmpPrivParm); - } - IsCorrect = - IsCorrect && SubstCombiner && - (!D->getInitializer() || - (D->getInitializerKind() == OMPDeclareReductionDecl::CallInit && - SubstInitializer) || - (D->getInitializerKind() != OMPDeclareReductionDecl::CallInit && - !SubstInitializer && !SubstInitializer)); - } else { - IsCorrect = false; } - - (void)SemaRef.ActOnOpenMPDeclareReductionDirectiveEnd(/*S=*/nullptr, DRD, - IsCorrect); + // Initializers instantiation sequence. 
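// A templated declare-reduction that takes this instantiation path, as a
// sketch; the combiner and initializer are now substituted unconditionally
// instead of being copied when no instantiation was required:
//
//   template <typename T>
//   T sum(T *v, int n) {
//   #pragma omp declare reduction(merge : T : omp_out += omp_in) \
//       initializer(omp_priv = T())
//     T s = T();
//   #pragma omp parallel for reduction(merge : s)
//     for (int i = 0; i < n; ++i)
//       s += v[i];
//     return s;
//   }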
+ if (Init) { + VarDecl *OmpPrivParm = SemaRef.ActOnOpenMPDeclareReductionInitializerStart( + /*S=*/nullptr, NewDRD); + SemaRef.CurrentInstantiationScope->InstantiatedLocal( + cast(D->getInitOrig())->getDecl(), + cast(NewDRD->getInitOrig())->getDecl()); + SemaRef.CurrentInstantiationScope->InstantiatedLocal( + cast(D->getInitPriv())->getDecl(), + cast(NewDRD->getInitPriv())->getDecl()); + if (D->getInitializerKind() == OMPDeclareReductionDecl::CallInit) { + SubstInitializer = SemaRef.SubstExpr(Init, TemplateArgs).get(); + } else { + auto *OldPrivParm = + cast(cast(D->getInitPriv())->getDecl()); + IsCorrect = IsCorrect && OldPrivParm->hasInit(); + if (IsCorrect) + SemaRef.InstantiateVariableInitializer(OmpPrivParm, OldPrivParm, + TemplateArgs); + } + SemaRef.ActOnOpenMPDeclareReductionInitializerEnd(NewDRD, SubstInitializer, + OmpPrivParm); + } + IsCorrect = IsCorrect && SubstCombiner && + (!Init || + (D->getInitializerKind() == OMPDeclareReductionDecl::CallInit && + SubstInitializer) || + (D->getInitializerKind() != OMPDeclareReductionDecl::CallInit && + !SubstInitializer)); + + (void)SemaRef.ActOnOpenMPDeclareReductionDirectiveEnd( + /*S=*/nullptr, DRD, IsCorrect && !D->isInvalidDecl()); return NewDRD; } diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 5f3b2d5600d6d..7de04e1228d4b 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -1983,6 +1983,19 @@ bool Sema::CheckQualifiedFunctionForTypeId(QualType T, SourceLocation Loc) { return true; } +// Helper to deduce addr space of a pointee type in OpenCL mode. +static QualType deduceOpenCLPointeeAddrSpace(Sema &S, QualType PointeeType) { + if (!PointeeType->isUndeducedAutoType() && !PointeeType->isDependentType() && + !PointeeType->isSamplerT() && + !PointeeType.getQualifiers().hasAddressSpace()) + PointeeType = S.getASTContext().getAddrSpaceQualType( + PointeeType, + S.getLangOpts().OpenCLCPlusPlus || S.getLangOpts().OpenCLVersion == 200 + ? LangAS::opencl_generic + : LangAS::opencl_private); + return PointeeType; +} + /// Build a pointer type. /// /// \param T The type to which we'll be building a pointer. @@ -2019,6 +2032,9 @@ QualType Sema::BuildPointerType(QualType T, if (getLangOpts().ObjCAutoRefCount) T = inferARCLifetimeForPointee(*this, T, Loc, /*reference*/ false); + if (getLangOpts().OpenCL) + T = deduceOpenCLPointeeAddrSpace(*this, T); + // Build the pointer type. return Context.getPointerType(T); } @@ -2079,6 +2095,9 @@ QualType Sema::BuildReferenceType(QualType T, bool SpelledAsLValue, if (getLangOpts().ObjCAutoRefCount) T = inferARCLifetimeForPointee(*this, T, Loc, /*reference*/ true); + if (getLangOpts().OpenCL) + T = deduceOpenCLPointeeAddrSpace(*this, T); + // Handle restrict on references. if (LValueRef) return Context.getLValueReferenceType(T, SpelledAsLValue); @@ -2664,6 +2683,9 @@ QualType Sema::BuildBlockPointerType(QualType T, if (checkQualifiedFunction(*this, T, Loc, QFK_BlockPointer)) return QualType(); + if (getLangOpts().OpenCL) + T = deduceOpenCLPointeeAddrSpace(*this, T); + return Context.getBlockPointerType(T); } @@ -4808,6 +4830,7 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, FunctionProtoType::ExtProtoInfo EPI; EPI.ExtInfo = EI; EPI.Variadic = FTI.isVariadic; + EPI.EllipsisLoc = FTI.getEllipsisLoc(); EPI.HasTrailingReturn = FTI.hasTrailingReturnType(); EPI.TypeQuals.addCVRUQualifiers( FTI.MethodQualifiers ? 
FTI.MethodQualifiers->getTypeQualifiers() @@ -4947,7 +4970,9 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, .getScopeRep() ->getKind() == NestedNameSpecifier::TypeSpec) || state.getDeclarator().getContext() == - DeclaratorContext::MemberContext; + DeclaratorContext::MemberContext || + state.getDeclarator().getContext() == + DeclaratorContext::LambdaExprContext; }; if (state.getSema().getLangOpts().OpenCLCPlusPlus && IsClassMember()) { @@ -4966,7 +4991,8 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state, // If a class member function's address space is not set, set it to // __generic. LangAS AS = - (ASIdx == LangAS::Default ? LangAS::opencl_generic : ASIdx); + (ASIdx == LangAS::Default ? S.getDefaultCXXMethodAddrSpace() + : ASIdx); EPI.TypeQuals.addAddressSpace(AS); } T = Context.getFunctionType(T, ParamTys, EPI); @@ -7472,137 +7498,6 @@ static void HandleOpenCLAccessAttr(QualType &CurType, const ParsedAttr &Attr, } } -static void deduceOpenCLImplicitAddrSpace(TypeProcessingState &State, - QualType &T, TypeAttrLocation TAL) { - Declarator &D = State.getDeclarator(); - - // Handle the cases where address space should not be deduced. - // - // The pointee type of a pointer type is always deduced since a pointer always - // points to some memory location which should has an address space. - // - // There are situations that at the point of certain declarations, the address - // space may be unknown and better to be left as default. For example, when - // defining a typedef or struct type, they are not associated with any - // specific address space. Later on, they may be used with any address space - // to declare a variable. - // - // The return value of a function is r-value, therefore should not have - // address space. - // - // The void type does not occupy memory, therefore should not have address - // space, except when it is used as a pointee type. - // - // Since LLVM assumes function type is in default address space, it should not - // have address space. - auto ChunkIndex = State.getCurrentChunkIndex(); - bool IsPointee = - ChunkIndex > 0 && - (D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Pointer || - D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Reference || - D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::BlockPointer); - // For pointers/references to arrays the next chunk is always an array - // followed by any number of parentheses. - if (!IsPointee && ChunkIndex > 1) { - auto AdjustedCI = ChunkIndex - 1; - if (D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Array) - AdjustedCI--; - // Skip over all parentheses. - while (AdjustedCI > 0 && - D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Paren) - AdjustedCI--; - if (D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Pointer || - D.getTypeObject(AdjustedCI).Kind == DeclaratorChunk::Reference) - IsPointee = true; - } - bool IsFuncReturnType = - ChunkIndex > 0 && - D.getTypeObject(ChunkIndex - 1).Kind == DeclaratorChunk::Function; - bool IsFuncType = - ChunkIndex < D.getNumTypeObjects() && - D.getTypeObject(ChunkIndex).Kind == DeclaratorChunk::Function; - if ( // Do not deduce addr space for function return type and function type, - // otherwise it will fail some sema check. - IsFuncReturnType || IsFuncType || - // Do not deduce addr space for member types of struct, except the pointee - // type of a pointer member type or static data members. 
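// Where the pointee deduction now happens instead: BuildPointerType,
// BuildReferenceType and BuildBlockPointerType call the new
// deduceOpenCLPointeeAddrSpace helper, so for a sketch like
//
//   kernel void k(global int *p) { int **q; }
//
// the unqualified pointee of 'q' is deduced __generic in OpenCL 2.0 and
// C++ for OpenCL, and __private in OpenCL 1.2 and earlier, matching the
// helper's version check.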
- (D.getContext() == DeclaratorContext::MemberContext && - (!IsPointee && - D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_static)) || - // Do not deduce addr space of non-pointee in type alias because it - // doesn't define any object. - (D.getContext() == DeclaratorContext::AliasDeclContext && !IsPointee) || - // Do not deduce addr space for types used to define a typedef and the - // typedef itself, except the pointee type of a pointer type which is used - // to define the typedef. - (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_typedef && - !IsPointee) || - // Do not deduce addr space of the void type, e.g. in f(void), otherwise - // it will fail some sema check. - (T->isVoidType() && !IsPointee) || - // Do not deduce addr spaces for dependent types because they might end - // up instantiating to a type with an explicit address space qualifier. - // Except for pointer or reference types because the addr space in - // template argument can only belong to a pointee. - (T->isDependentType() && !T->isPointerType() && !T->isReferenceType()) || - // Do not deduce addr space of decltype because it will be taken from - // its argument. - T->isDecltypeType() || - // OpenCL spec v2.0 s6.9.b: - // The sampler type cannot be used with the __local and __global address - // space qualifiers. - // OpenCL spec v2.0 s6.13.14: - // Samplers can also be declared as global constants in the program - // source using the following syntax. - // const sampler_t = - // In codegen, file-scope sampler type variable has special handing and - // does not rely on address space qualifier. On the other hand, deducing - // address space of const sampler file-scope variable as global address - // space causes spurious diagnostic about __global address space - // qualifier, therefore do not deduce address space of file-scope sampler - // type variable. - (D.getContext() == DeclaratorContext::FileContext && T->isSamplerT())) - return; - - LangAS ImpAddr = LangAS::Default; - // Put OpenCL automatic variable in private address space. - // OpenCL v1.2 s6.5: - // The default address space name for arguments to a function in a - // program, or local variables of a function is __private. All function - // arguments shall be in the __private address space. - if (State.getSema().getLangOpts().OpenCLVersion <= 120 && - !State.getSema().getLangOpts().OpenCLCPlusPlus) { - ImpAddr = LangAS::opencl_private; - } else { - // If address space is not set, OpenCL 2.0 defines non private default - // address spaces for some cases: - // OpenCL 2.0, section 6.5: - // The address space for a variable at program scope or a static variable - // inside a function can either be __global or __constant, but defaults to - // __global if not specified. - // (...) - // Pointers that are declared without pointing to a named address space - // point to the generic address space. - if (IsPointee) { - ImpAddr = LangAS::opencl_generic; - } else { - if (D.getContext() == DeclaratorContext::TemplateArgContext) { - // Do not deduce address space for non-pointee type in template arg. 
- } else if (D.getContext() == DeclaratorContext::FileContext) { - ImpAddr = LangAS::opencl_global; - } else { - if (D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_static || - D.getDeclSpec().getStorageClassSpec() == DeclSpec::SCS_extern) { - ImpAddr = LangAS::opencl_global; - } else { - ImpAddr = LangAS::opencl_private; - } - } - } - } - T = State.getSema().Context.getAddrSpaceQualType(T, ImpAddr); -} - static void HandleLifetimeBoundAttr(TypeProcessingState &State, QualType &CurType, ParsedAttr &Attr) { @@ -7667,7 +7562,7 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, // Otherwise, only consider type processing for a C++11 attribute if // it's actually been applied to a type. // We also allow C++11 address_space and - // opencl language address space attributes to pass through. + // OpenCL language address space attributes to pass through. continue; } } @@ -7847,8 +7742,6 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, if (!state.getSema().getLangOpts().OpenCL || type.getAddressSpace() != LangAS::Default) return; - - deduceOpenCLImplicitAddrSpace(state, type, TAL); } void Sema::completeExprArrayBound(Expr *E) { diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 3592461f8e91e..812d3a1283728 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4583,14 +4583,6 @@ QualType TreeTransform::TransformDecayedType(TypeLocBuilder &TLB, return Result; } -/// Helper to deduce addr space of a pointee type in OpenCL mode. -/// If the type is updated it will be overwritten in PointeeType param. -inline void deduceOpenCLPointeeAddrSpace(Sema &SemaRef, QualType &PointeeType) { - if (PointeeType.getAddressSpace() == LangAS::Default) - PointeeType = SemaRef.Context.getAddrSpaceQualType(PointeeType, - LangAS::opencl_generic); -} - template QualType TreeTransform::TransformPointerType(TypeLocBuilder &TLB, PointerTypeLoc TL) { @@ -4599,9 +4591,6 @@ QualType TreeTransform::TransformPointerType(TypeLocBuilder &TLB, if (PointeeType.isNull()) return QualType(); - if (SemaRef.getLangOpts().OpenCL) - deduceOpenCLPointeeAddrSpace(SemaRef, PointeeType); - QualType Result = TL.getType(); if (PointeeType->getAs()) { // A dependent pointer type 'T *' has is being transformed such @@ -4640,9 +4629,6 @@ TreeTransform::TransformBlockPointerType(TypeLocBuilder &TLB, if (PointeeType.isNull()) return QualType(); - if (SemaRef.getLangOpts().OpenCL) - deduceOpenCLPointeeAddrSpace(SemaRef, PointeeType); - QualType Result = TL.getType(); if (getDerived().AlwaysRebuild() || PointeeType != TL.getPointeeLoc().getType()) { @@ -4672,9 +4658,6 @@ TreeTransform::TransformReferenceType(TypeLocBuilder &TLB, if (PointeeType.isNull()) return QualType(); - if (SemaRef.getLangOpts().OpenCL) - deduceOpenCLPointeeAddrSpace(SemaRef, PointeeType); - QualType Result = TL.getType(); if (getDerived().AlwaysRebuild() || PointeeType != T->getPointeeTypeAsWritten()) { diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 8991a39a70679..d989f46c4ab4b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -424,6 +424,8 @@ namespace clang { template void mergeMergeable(Mergeable *D); + void mergeMergeable(LifetimeExtendedTemporaryDecl *D); + void mergeTemplatePattern(RedeclarableTemplateDecl *D, RedeclarableTemplateDecl *Existing, DeclID DsID, bool IsKeyDecl); @@ -884,6 +886,7 @@ void ASTDeclReader::VisitFunctionDecl(FunctionDecl 
*FD) { FD->ODRHash = Record.readInt(); FD->setHasODRHash(true); + FD->setUsesFPIntrin(Record.readInt()); switch ((FunctionDecl::TemplatedKind)Record.readInt()) { case FunctionDecl::TK_NonTemplate: @@ -2358,6 +2361,7 @@ void ASTDeclReader::VisitLifetimeExtendedTemporaryDecl( if (Record.readInt()) D->Value = new (D->getASTContext()) APValue(Record.readAPValue()); D->ManglingNumber = Record.readInt(); + mergeMergeable(D); } std::pair @@ -2555,6 +2559,25 @@ static bool allowODRLikeMergeInC(NamedDecl *ND) { return false; } +/// Attempts to merge LifetimeExtendedTemporaryDecl with +/// identical class definitions from two different modules. +void ASTDeclReader::mergeMergeable(LifetimeExtendedTemporaryDecl *D) { + // If modules are not available, there is no reason to perform this merge. + if (!Reader.getContext().getLangOpts().Modules) + return; + + LifetimeExtendedTemporaryDecl *LETDecl = D; + + LifetimeExtendedTemporaryDecl *&LookupResult = + Reader.LETemporaryForMerging[std::make_pair( + LETDecl->getExtendingDecl(), LETDecl->getManglingNumber())]; + if (LookupResult) + Reader.getContext().setPrimaryMergedDecl(LETDecl, + LookupResult->getCanonicalDecl()); + else + LookupResult = LETDecl; +} + /// Attempts to merge the given declaration (D) with another declaration /// of the same entity, for the case where the entity is not actually /// redeclarable. This happens, for instance, when merging the fields of diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 51902a607ca12..38eb64e52e4ac 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -559,6 +559,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { Record.AddSourceLocation(D->getEndLoc()); Record.push_back(D->getODRHash()); + Record.push_back(D->usesFPIntrin()); Record.push_back(D->getTemplatedKind()); switch (D->getTemplatedKind()) { diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index a824499518730..01c7afe520410 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -2525,19 +2525,18 @@ MallocChecker::LeakInfo MallocChecker::getAllocationSite(const ExplodedNode *N, // Find the most recent expression bound to the symbol in the current // context. - if (!ReferenceRegion) { - if (const MemRegion *MR = C.getLocationRegionIfPostStore(N)) { - SVal Val = State->getSVal(MR); - if (Val.getAsLocSymbol() == Sym) { - const VarRegion* VR = MR->getBaseRegion()->getAs(); - // Do not show local variables belonging to a function other than - // where the error is reported. - if (!VR || - (VR->getStackFrame() == LeakContext->getStackFrame())) - ReferenceRegion = MR; - } + if (!ReferenceRegion) { + if (const MemRegion *MR = C.getLocationRegionIfPostStore(N)) { + SVal Val = State->getSVal(MR); + if (Val.getAsLocSymbol() == Sym) { + const VarRegion *VR = MR->getBaseRegion()->getAs(); + // Do not show local variables belonging to a function other than + // where the error is reported. + if (!VR || (VR->getStackFrame() == LeakContext->getStackFrame())) + ReferenceRegion = MR; } } + } // Allocation node, is the last node in the current or parent context in // which the symbol was tracked. 
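
// The new ASTDeclReader::mergeMergeable(LifetimeExtendedTemporaryDecl *)
// overload above boils down to a first-entry-wins table: temporaries are
// keyed on (extending decl, mangling number), the first declaration seen for
// a key becomes canonical, and later duplicates loaded from other modules are
// redirected to it. The sketch below is a minimal standalone illustration of
// that merging pattern, compilable with any C++14 compiler; the stand-in
// types and names here are invented for the example and are not Clang APIs.

#include <cassert>
#include <map>
#include <utility>

struct TemporaryDecl {
  int ExtendingDeclID;                 // stand-in for the extending ValueDecl
  unsigned ManglingNumber;             // disambiguates temporaries per decl
  TemporaryDecl *MergedWith = nullptr; // set when merged into a canonical decl
};

// Stand-in for the reader's LETemporaryForMerging map.
static std::map<std::pair<int, unsigned>, TemporaryDecl *> TemporaryForMerging;

static void mergeTemporary(TemporaryDecl *D) {
  TemporaryDecl *&Slot =
      TemporaryForMerging[{D->ExtendingDeclID, D->ManglingNumber}];
  if (Slot)
    D->MergedWith = Slot; // duplicate from another module: merge into first
  else
    Slot = D;             // first sighting: this decl becomes canonical
}

int main() {
  TemporaryDecl A{1, 0}, B{1, 0}, C{1, 1};
  mergeTemporary(&A); // canonical for key (1, 0)
  mergeTemporary(&B); // same key as A, so merged into A
  mergeTemporary(&C); // different mangling number, so canonical
  assert(!A.MergedWith && B.MergedWith == &A && !C.MergedWith);
  return 0;
}
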
diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 7a0dda563282d..47099f2afb6a4 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -64,6 +64,7 @@ class StreamChecker : public Checker Callbacks = { {{"fopen"}, &StreamChecker::evalFopen}, + {{"freopen", 3}, &StreamChecker::evalFreopen}, {{"tmpfile"}, &StreamChecker::evalFopen}, {{"fclose", 1}, &StreamChecker::evalFclose}, {{"fread", 4}, @@ -90,6 +91,7 @@ class StreamChecker : public Checker(Call.getOriginExpr()); + if (!CE) + return; + + Optional StreamVal = Call.getArgSVal(2).getAs(); + if (!StreamVal) + return; + // Do not allow NULL as the passed stream pointer. + // This is not specified in the man page but may crash on some systems. + checkNullStream(*StreamVal, C, State); + // Check if an error was generated. + if (C.isDifferent()) + return; + + SymbolRef StreamSym = StreamVal->getAsSymbol(); + // Do not care about special values for stream ("(FILE *)0x12345"?). + if (!StreamSym) + return; + + // Generate state for the non-failed case. + // The return value is the passed stream pointer. + // According to the documentation, the stream is closed first + // but any close error is ignored. The state changes to (or remains) opened. + ProgramStateRef StateRetNotNull = + State->BindExpr(CE, C.getLocationContext(), *StreamVal); + // Generate state for the NULL return value. + // The stream switches to the OpenFailed state. + ProgramStateRef StateRetNull = State->BindExpr(CE, C.getLocationContext(), + C.getSValBuilder().makeNull()); + + StateRetNotNull = + StateRetNotNull->set(StreamSym, StreamState::getOpened()); + StateRetNull = + StateRetNull->set(StreamSym, StreamState::getOpenFailed()); + + C.addTransition(StateRetNotNull); + C.addTransition(StateRetNull); +} + void StreamChecker::evalFclose(const CallEvent &Call, CheckerContext &C) const { ProgramStateRef State = C.getState(); if (checkDoubleClose(Call, C, State)) diff --git a/clang/lib/Tooling/CMakeLists.txt b/clang/lib/Tooling/CMakeLists.txt index 05061f0a10a80..59c990daaa29f 100644 --- a/clang/lib/Tooling/CMakeLists.txt +++ b/clang/lib/Tooling/CMakeLists.txt @@ -17,6 +17,7 @@ add_clang_library(clangTooling CommonOptionsParser.cpp CompilationDatabase.cpp Execution.cpp + ExpandResponseFilesCompilationDatabase.cpp FileMatchTrie.cpp FixIt.cpp GuessTargetAndModeCompilationDatabase.cpp diff --git a/clang/lib/Tooling/ExpandResponseFilesCompilationDatabase.cpp b/clang/lib/Tooling/ExpandResponseFilesCompilationDatabase.cpp new file mode 100644 index 0000000000000..84936ba05b20f --- /dev/null +++ b/clang/lib/Tooling/ExpandResponseFilesCompilationDatabase.cpp @@ -0,0 +1,88 @@ +//===- ExpandResponseFilesCompilationDatabase.cpp -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Tooling/CompilationDatabase.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/StringSaver.h" + +namespace clang { +namespace tooling { +namespace { + +class ExpandResponseFilesDatabase : public CompilationDatabase { +public: + ExpandResponseFilesDatabase( + std::unique_ptr Base, + llvm::cl::TokenizerCallback Tokenizer, + llvm::IntrusiveRefCntPtr FS) + : Base(std::move(Base)), Tokenizer(Tokenizer), FS(std::move(FS)) { + assert(this->Base != nullptr); + assert(this->Tokenizer != nullptr); + assert(this->FS != nullptr); + } + + std::vector getAllFiles() const override { + return Base->getAllFiles(); + } + + std::vector + getCompileCommands(StringRef FilePath) const override { + return expand(Base->getCompileCommands(FilePath)); + } + + std::vector getAllCompileCommands() const override { + return expand(Base->getAllCompileCommands()); + } + +private: + std::vector expand(std::vector Cmds) const { + for (auto &Cmd : Cmds) { + bool SeenRSPFile = false; + llvm::SmallVector Argv; + Argv.reserve(Cmd.CommandLine.size()); + for (auto &Arg : Cmd.CommandLine) { + Argv.push_back(Arg.c_str()); + SeenRSPFile |= Arg.front() == '@'; + } + if (!SeenRSPFile) + continue; + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver(Alloc); + llvm::cl::ExpandResponseFiles(Saver, Tokenizer, Argv, false, false, *FS, + llvm::StringRef(Cmd.Directory)); + Cmd.CommandLine.assign(Argv.begin(), Argv.end()); + } + return Cmds; + } + +private: + std::unique_ptr Base; + llvm::cl::TokenizerCallback Tokenizer; + llvm::IntrusiveRefCntPtr FS; +}; + +} // namespace + +std::unique_ptr +expandResponseFiles(std::unique_ptr Base, + llvm::IntrusiveRefCntPtr FS) { + auto Tokenizer = llvm::Triple(llvm::sys::getProcessTriple()).isOSWindows() + ? llvm::cl::TokenizeWindowsCommandLine + : llvm::cl::TokenizeGNUCommandLine; + return std::make_unique( + std::move(Base), Tokenizer, std::move(FS)); +} + +} // namespace tooling +} // namespace clang diff --git a/clang/lib/Tooling/JSONCompilationDatabase.cpp b/clang/lib/Tooling/JSONCompilationDatabase.cpp index f19a0f7550b96..04dd4dbf62484 100644 --- a/clang/lib/Tooling/JSONCompilationDatabase.cpp +++ b/clang/lib/Tooling/JSONCompilationDatabase.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/StringSaver.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/YAMLParser.h" #include "llvm/Support/raw_ostream.h" #include @@ -168,7 +169,8 @@ class JSONCompilationDatabasePlugin : public CompilationDatabasePlugin { auto Base = JSONCompilationDatabase::loadFromFile( JSONDatabasePath, ErrorMessage, JSONCommandLineSyntax::AutoDetect); return Base ? 
inferTargetAndDriverMode( - inferMissingCompileCommands(std::move(Base))) + inferMissingCompileCommands(expandResponseFiles( + std::move(Base), llvm::vfs::getRealFileSystem()))) : nullptr; } }; diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index dddc265c8c416..67081497d04c0 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// #include "clang/Tooling/Syntax/BuildTree.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclBase.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Stmt.h" #include "clang/Basic/LLVM.h" @@ -56,6 +58,14 @@ class syntax::TreeBuilder { /// Range. void foldNode(llvm::ArrayRef Range, syntax::Tree *New); + /// Must be called with the range of each `DeclaratorDecl`. Ensures the + /// corresponding declarator nodes are covered by `SimpleDeclaration`. + void noticeDeclaratorRange(llvm::ArrayRef Range); + + /// Notifies that we should not consume trailing semicolon when computing + /// token range of \p D. + void noticeDeclaratorWithoutSemicolon(Decl *D); + /// Mark the \p Child node with a corresponding \p Role. All marked children /// should be consumed by foldNode. /// (!) when called on expressions (clang::Expr is derived from clang::Stmt), @@ -66,7 +76,7 @@ class syntax::TreeBuilder { void markExprChild(Expr *Child, NodeRole Role); /// Set role for a token starting at \p Loc. - void markChildToken(SourceLocation Loc, tok::TokenKind Kind, NodeRole R); + void markChildToken(SourceLocation Loc, NodeRole R); /// Finish building the tree and consume the root node. syntax::TranslationUnit *finalize() && { @@ -94,7 +104,14 @@ class syntax::TreeBuilder { return llvm::makeArrayRef(findToken(First), std::next(findToken(Last))); } llvm::ArrayRef getRange(const Decl *D) const { - return getRange(D->getBeginLoc(), D->getEndLoc()); + auto Tokens = getRange(D->getBeginLoc(), D->getEndLoc()); + if (llvm::isa(D)) + return Tokens; + if (DeclsWithoutSemicolons.count(D)) + return Tokens; + // FIXME: do not consume trailing semicolon on function definitions. + // Most declarations own a semicolon in syntax trees, but not in clang AST. + return withTrailingSemicolon(Tokens); } llvm::ArrayRef getExprRange(const Expr *E) const { return getRange(E->getBeginLoc(), E->getEndLoc()); @@ -108,14 +125,22 @@ class syntax::TreeBuilder { // Some statements miss a trailing semicolon, e.g. 'return', 'continue' and // all statements that end with those. Consume this semicolon here. - // - // (!) statements never consume 'eof', so looking at the next token is ok. + if (Tokens.back().kind() == tok::semi) + return Tokens; + return withTrailingSemicolon(Tokens); + } + +private: + llvm::ArrayRef + withTrailingSemicolon(llvm::ArrayRef Tokens) const { + assert(!Tokens.empty()); + assert(Tokens.back().kind() != tok::eof); + // (!) we never consume 'eof', so looking at the next token is ok. if (Tokens.back().kind() != tok::semi && Tokens.end()->kind() == tok::semi) return llvm::makeArrayRef(Tokens.begin(), Tokens.end() + 1); return Tokens; } -private: /// Finds a token starting at \p L. The token must exist. 
const syntax::Token *findToken(SourceLocation L) const; @@ -136,6 +161,8 @@ class syntax::TreeBuilder { {&T, NodeAndRole{new (A.allocator()) syntax::Leaf(&T)}}); } + ~Forest() { assert(DelayedFolds.empty()); } + void assignRole(llvm::ArrayRef Range, syntax::NodeRole Role) { assert(!Range.empty()); @@ -148,30 +175,46 @@ class syntax::TreeBuilder { It->second.Role = Role; } - /// Add \p Node to the forest and fill its children nodes based on the \p - /// NodeRange. - void foldChildren(llvm::ArrayRef NodeTokens, + /// Add \p Node to the forest and attach child nodes based on \p Tokens. + void foldChildren(llvm::ArrayRef Tokens, syntax::Tree *Node) { - assert(!NodeTokens.empty()); - assert(Node->firstChild() == nullptr && "node already has children"); - - auto *FirstToken = NodeTokens.begin(); - auto BeginChildren = Trees.lower_bound(FirstToken); - assert(BeginChildren != Trees.end() && - BeginChildren->first == FirstToken && - "fold crosses boundaries of existing subtrees"); - auto EndChildren = Trees.lower_bound(NodeTokens.end()); - assert((EndChildren == Trees.end() || - EndChildren->first == NodeTokens.end()) && - "fold crosses boundaries of existing subtrees"); + // Execute delayed folds inside `Tokens`. + auto BeginExecuted = DelayedFolds.lower_bound(Tokens.begin()); + auto It = BeginExecuted; + for (; It != DelayedFolds.end() && It->second.End <= Tokens.end(); ++It) + foldChildrenEager(llvm::makeArrayRef(It->first, It->second.End), + It->second.Node); + DelayedFolds.erase(BeginExecuted, It); + + // Attach children to `Node`. + foldChildrenEager(Tokens, Node); + } - // (!) we need to go in reverse order, because we can only prepend. - for (auto It = EndChildren; It != BeginChildren; --It) - Node->prependChildLowLevel(std::prev(It)->second.Node, - std::prev(It)->second.Role); + /// Schedule a call to `foldChildren` that will only be executed when + /// the containing node is folded. The range of delayed nodes can be extended + /// by calling `extendDelayedFold`. Only one delayed node for each starting + /// token is allowed. + void foldChildrenDelayed(llvm::ArrayRef Tokens, + syntax::Tree *Node) { + assert(!Tokens.empty()); + bool Inserted = + DelayedFolds.insert({Tokens.begin(), DelayedFold{Tokens.end(), Node}}) + .second; + (void)Inserted; + assert(Inserted && "Multiple delayed folds start at the same token"); + } - Trees.erase(BeginChildren, EndChildren); - Trees.insert({FirstToken, NodeAndRole(Node)}); + /// If there is a delayed fold starting at `ExtendedRange.begin()`, extends + /// its endpoint to `ExtendedRange.end()` and returns true. + /// Otherwise, returns false. + bool extendDelayedFold(llvm::ArrayRef ExtendedRange) { + assert(!ExtendedRange.empty()); + auto It = DelayedFolds.find(ExtendedRange.data()); + if (It == DelayedFolds.end()) + return false; + assert(It->second.End <= ExtendedRange.end()); + It->second.End = ExtendedRange.end(); + return true; } // EXPECTS: all tokens were consumed and are owned by a single root node. @@ -199,6 +242,30 @@ class syntax::TreeBuilder { } private: + /// Implementation detail of `foldChildren`, does actual folding ignoring + /// delayed folds. 
+ void foldChildrenEager(llvm::ArrayRef Tokens, + syntax::Tree *Node) { + assert(Node->firstChild() == nullptr && "node already has children"); + + auto *FirstToken = Tokens.begin(); + auto BeginChildren = Trees.lower_bound(FirstToken); + assert((BeginChildren == Trees.end() || + BeginChildren->first == FirstToken) && + "fold crosses boundaries of existing subtrees"); + auto EndChildren = Trees.lower_bound(Tokens.end()); + assert( + (EndChildren == Trees.end() || EndChildren->first == Tokens.end()) && + "fold crosses boundaries of existing subtrees"); + + // (!) we need to go in reverse order, because we can only prepend. + for (auto It = EndChildren; It != BeginChildren; --It) + Node->prependChildLowLevel(std::prev(It)->second.Node, + std::prev(It)->second.Role); + + Trees.erase(BeginChildren, EndChildren); + Trees.insert({FirstToken, NodeAndRole(Node)}); + } /// A with a role that should be assigned to it when adding to a parent. struct NodeAndRole { explicit NodeAndRole(syntax::Node *Node) @@ -209,9 +276,18 @@ class syntax::TreeBuilder { }; /// Maps from the start token to a subtree starting at that token. + /// Keys in the map are pointers into the array of expanded tokens, so + /// pointer order corresponds to the order of preprocessor tokens. /// FIXME: storing the end tokens is redundant. /// FIXME: the key of a map is redundant, it is also stored in NodeForRange. std::map Trees; + + /// See documentation of `foldChildrenDelayed` for details. + struct DelayedFold { + const syntax::Token *End = nullptr; + syntax::Tree *Node = nullptr; + }; + std::map DelayedFolds; }; /// For debugging purposes. @@ -219,6 +295,7 @@ class syntax::TreeBuilder { syntax::Arena &Arena; Forest Pending; + llvm::DenseSet DeclsWithoutSemicolons; }; namespace { @@ -229,20 +306,30 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool shouldTraversePostOrder() const { return true; } - bool TraverseDecl(Decl *D) { - if (!D || isa(D)) - return RecursiveASTVisitor::TraverseDecl(D); - if (!llvm::isa(D->getDeclContext())) - return true; // Only build top-level decls for now, do not recurse. - return RecursiveASTVisitor::TraverseDecl(D); + bool WalkUpFromDeclaratorDecl(DeclaratorDecl *D) { + // Ensure declarators are covered by SimpleDeclaration. + Builder.noticeDeclaratorRange(Builder.getRange(D)); + // FIXME: build nodes for the declarator too. + return true; + } + bool WalkUpFromTypedefNameDecl(TypedefNameDecl *D) { + // Also a declarator. + Builder.noticeDeclaratorRange(Builder.getRange(D)); + // FIXME: build nodes for the declarator too. + return true; } bool VisitDecl(Decl *D) { - assert(llvm::isa(D->getDeclContext()) && - "expected a top-level decl"); assert(!D->isImplicit()); Builder.foldNode(Builder.getRange(D), - new (allocator()) syntax::TopLevelDeclaration()); + new (allocator()) syntax::UnknownDeclaration()); + return true; + } + + bool WalkUpFromTagDecl(TagDecl *C) { + // Avoid building UnknownDeclaration here: syntactically 'struct X {}' and + // similar are part of declaration specifiers and do not introduce a new + // top-level declaration. 
return true; } @@ -255,11 +342,10 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromCompoundStmt(CompoundStmt *S) { using NodeRole = syntax::NodeRole; - Builder.markChildToken(S->getLBracLoc(), tok::l_brace, NodeRole::OpenParen); + Builder.markChildToken(S->getLBracLoc(), NodeRole::OpenParen); for (auto *Child : S->body()) Builder.markStmtChild(Child, NodeRole::CompoundStatement_statement); - Builder.markChildToken(S->getRBracLoc(), tok::r_brace, - NodeRole::CloseParen); + Builder.markChildToken(S->getRBracLoc(), NodeRole::CloseParen); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::CompoundStatement); @@ -290,7 +376,11 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool TraverseStmt(Stmt *S) { - if (auto *E = llvm::dyn_cast_or_null(S)) { + if (auto *DS = llvm::dyn_cast_or_null(S)) { + // We want to consume the semicolon, make sure SimpleDeclaration does not. + for (auto *D : DS->decls()) + Builder.noticeDeclaratorWithoutSemicolon(D); + } else if (auto *E = llvm::dyn_cast_or_null(S)) { // (!) do not recurse into subexpressions. // we do not have syntax trees for expressions yet, so we only want to see // the first top-level expression. @@ -323,7 +413,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromSwitchStmt(SwitchStmt *S) { - Builder.markChildToken(S->getSwitchLoc(), tok::kw_switch, + Builder.markChildToken(S->getSwitchLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), @@ -332,7 +422,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromCaseStmt(CaseStmt *S) { - Builder.markChildToken(S->getKeywordLoc(), tok::kw_case, + Builder.markChildToken(S->getKeywordLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markExprChild(S->getLHS(), syntax::NodeRole::CaseStatement_value); Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); @@ -342,7 +432,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromDefaultStmt(DefaultStmt *S) { - Builder.markChildToken(S->getKeywordLoc(), tok::kw_default, + Builder.markChildToken(S->getKeywordLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), @@ -351,11 +441,10 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromIfStmt(IfStmt *S) { - Builder.markChildToken(S->getIfLoc(), tok::kw_if, - syntax::NodeRole::IntroducerKeyword); + Builder.markChildToken(S->getIfLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getThen(), syntax::NodeRole::IfStatement_thenStatement); - Builder.markChildToken(S->getElseLoc(), tok::kw_else, + Builder.markChildToken(S->getElseLoc(), syntax::NodeRole::IfStatement_elseKeyword); Builder.markStmtChild(S->getElse(), syntax::NodeRole::IfStatement_elseStatement); @@ -365,8 +454,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromForStmt(ForStmt *S) { - Builder.markChildToken(S->getForLoc(), tok::kw_for, - syntax::NodeRole::IntroducerKeyword); + Builder.markChildToken(S->getForLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::ForStatement); @@ -374,7 +462,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromWhileStmt(WhileStmt *S) { - Builder.markChildToken(S->getWhileLoc(), 
tok::kw_while, + Builder.markChildToken(S->getWhileLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), @@ -383,7 +471,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromContinueStmt(ContinueStmt *S) { - Builder.markChildToken(S->getContinueLoc(), tok::kw_continue, + Builder.markChildToken(S->getContinueLoc(), syntax::NodeRole::IntroducerKeyword); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::ContinueStatement); @@ -391,7 +479,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromBreakStmt(BreakStmt *S) { - Builder.markChildToken(S->getBreakLoc(), tok::kw_break, + Builder.markChildToken(S->getBreakLoc(), syntax::NodeRole::IntroducerKeyword); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::BreakStatement); @@ -399,7 +487,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromReturnStmt(ReturnStmt *S) { - Builder.markChildToken(S->getReturnLoc(), tok::kw_return, + Builder.markChildToken(S->getReturnLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markExprChild(S->getRetValue(), syntax::NodeRole::ReturnStatement_value); @@ -409,8 +497,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { } bool WalkUpFromCXXForRangeStmt(CXXForRangeStmt *S) { - Builder.markChildToken(S->getForLoc(), tok::kw_for, - syntax::NodeRole::IntroducerKeyword); + Builder.markChildToken(S->getForLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), new (allocator()) syntax::RangeBasedForStatement); @@ -431,8 +518,19 @@ void syntax::TreeBuilder::foldNode(llvm::ArrayRef Range, Pending.foldChildren(Range, New); } -void syntax::TreeBuilder::markChildToken(SourceLocation Loc, - tok::TokenKind Kind, NodeRole Role) { +void syntax::TreeBuilder::noticeDeclaratorRange( + llvm::ArrayRef Range) { + if (Pending.extendDelayedFold(Range)) + return; + Pending.foldChildrenDelayed(Range, + new (allocator()) syntax::SimpleDeclaration); +} + +void syntax::TreeBuilder::noticeDeclaratorWithoutSemicolon(Decl *D) { + DeclsWithoutSemicolons.insert(D); +} + +void syntax::TreeBuilder::markChildToken(SourceLocation Loc, NodeRole Role) { if (Loc.isInvalid()) return; Pending.assignRole(*findToken(Loc), Role); diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp index 776330ab585fc..b2ed4ffa22c2b 100644 --- a/clang/lib/Tooling/Syntax/Nodes.cpp +++ b/clang/lib/Tooling/Syntax/Nodes.cpp @@ -16,8 +16,6 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) { return OS << "Leaf"; case NodeKind::TranslationUnit: return OS << "TranslationUnit"; - case NodeKind::TopLevelDeclaration: - return OS << "TopLevelDeclaration"; case NodeKind::UnknownExpression: return OS << "UnknownExpression"; case NodeKind::UnknownStatement: @@ -50,6 +48,10 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) { return OS << "ExpressionStatement"; case NodeKind::CompoundStatement: return OS << "CompoundStatement"; + case NodeKind::UnknownDeclaration: + return OS << "UnknownDeclaration"; + case NodeKind::SimpleDeclaration: + return OS << "SimpleDeclaration"; } llvm_unreachable("unknown node kind"); } diff --git a/clang/lib/Tooling/Syntax/Tokens.cpp b/clang/lib/Tooling/Syntax/Tokens.cpp index a2c3bc137d6ba..5941507e086d2 100644 --- a/clang/lib/Tooling/Syntax/Tokens.cpp +++ 
b/clang/lib/Tooling/Syntax/Tokens.cpp @@ -119,6 +119,22 @@ llvm::StringRef FileRange::text(const SourceManager &SM) const { return Text.substr(Begin, length()); } +llvm::ArrayRef TokenBuffer::expandedTokens(SourceRange R) const { + if (R.isInvalid()) + return {}; + const Token *Begin = + llvm::partition_point(expandedTokens(), [&](const syntax::Token &T) { + return SourceMgr->isBeforeInTranslationUnit(T.location(), R.getBegin()); + }); + const Token *End = + llvm::partition_point(expandedTokens(), [&](const syntax::Token &T) { + return !SourceMgr->isBeforeInTranslationUnit(R.getEnd(), T.location()); + }); + if (Begin > End) + return {}; + return {Begin, End}; +} + std::pair TokenBuffer::spelledForExpandedToken(const syntax::Token *Expanded) const { assert(Expanded); diff --git a/clang/test/AST/Interp/cond.cpp b/clang/test/AST/Interp/cond.cpp index 8a5a318c216d9..1fc69ed333e15 100644 --- a/clang/test/AST/Interp/cond.cpp +++ b/clang/test/AST/Interp/cond.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++17 -fsyntax-only -fforce-experimental-new-constant-interpreter %s -verify +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -fexperimental-new-constant-interpreter %s -verify // RUN: %clang_cc1 -std=c++17 -fsyntax-only %s -verify // expected-no-diagnostics diff --git a/clang/test/AST/ast-dump-file-line-json.c b/clang/test/AST/ast-dump-file-line-json.c new file mode 100644 index 0000000000000..89807cb8274ed --- /dev/null +++ b/clang/test/AST/ast-dump-file-line-json.c @@ -0,0 +1,309 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ast-dump=json %s | FileCheck %s + +#line 4 "test.c" +int a; + +#line 32 "bar.h" +int b; +int c; + +#line 11 "test.c" +int d; +int e; +// NOTE: CHECK lines have been autogenerated by gen_ast_dump_json_test.py + + +// CHECK: "kind": "TranslationUnitDecl", +// CHECK-NEXT: "loc": {}, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": {}, +// CHECK-NEXT: "end": {} +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "TypedefDecl", +// CHECK-NEXT: "loc": {}, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": {}, +// CHECK-NEXT: "end": {} +// CHECK-NEXT: }, +// CHECK-NEXT: "isImplicit": true, +// CHECK-NEXT: "name": "__int128_t", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "__int128" +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "BuiltinType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "__int128" +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "TypedefDecl", +// CHECK-NEXT: "loc": {}, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": {}, +// CHECK-NEXT: "end": {} +// CHECK-NEXT: }, +// CHECK-NEXT: "isImplicit": true, +// CHECK-NEXT: "name": "__uint128_t", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "unsigned __int128" +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "BuiltinType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "unsigned __int128" +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "TypedefDecl", +// CHECK-NEXT: "loc": {}, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": {}, +// CHECK-NEXT: "end": {} +// CHECK-NEXT: }, +// CHECK-NEXT: "isImplicit": true, +// CHECK-NEXT: "name": "__NSConstantString", +// 
CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "struct __NSConstantString_tag" +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "RecordType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "struct __NSConstantString_tag" +// CHECK-NEXT: }, +// CHECK-NEXT: "decl": { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "RecordDecl", +// CHECK-NEXT: "name": "__NSConstantString_tag" +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "TypedefDecl", +// CHECK-NEXT: "loc": {}, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": {}, +// CHECK-NEXT: "end": {} +// CHECK-NEXT: }, +// CHECK-NEXT: "isImplicit": true, +// CHECK-NEXT: "name": "__builtin_ms_va_list", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "char *" +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "PointerType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "char *" +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "BuiltinType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "char" +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "TypedefDecl", +// CHECK-NEXT: "loc": {}, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": {}, +// CHECK-NEXT: "end": {} +// CHECK-NEXT: }, +// CHECK-NEXT: "isImplicit": true, +// CHECK-NEXT: "name": "__builtin_va_list", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "struct __va_list_tag [1]" +// CHECK-NEXT: }, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "ConstantArrayType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "struct __va_list_tag [1]" +// CHECK-NEXT: }, +// CHECK-NEXT: "size": 1, +// CHECK-NEXT: "inner": [ +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "RecordType", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "struct __va_list_tag" +// CHECK-NEXT: }, +// CHECK-NEXT: "decl": { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "RecordDecl", +// CHECK-NEXT: "name": "__va_list_tag" +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "loc": { +// CHECK-NEXT: "offset": 105, +// CHECK-NEXT: "file": "{{.*}}", +// CHECK-NEXT: "line": 4, +// CHECK-NEXT: "presumedFile": "test.c", +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: }, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": { +// CHECK-NEXT: "offset": 101, +// CHECK-NEXT: "col": 1, +// CHECK-NEXT: "tokLen": 3 +// CHECK-NEXT: }, +// CHECK-NEXT: "end": { +// CHECK-NEXT: "offset": 105, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: "name": "a", +// CHECK-NEXT: "mangledName": "a", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "int" +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "loc": { +// CHECK-NEXT: "offset": 130, +// CHECK-NEXT: "line": 7, +// CHECK-NEXT: "presumedFile": "bar.h", +// CHECK-NEXT: 
"presumedLine": 32, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: }, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": { +// CHECK-NEXT: "offset": 126, +// CHECK-NEXT: "col": 1, +// CHECK-NEXT: "tokLen": 3 +// CHECK-NEXT: }, +// CHECK-NEXT: "end": { +// CHECK-NEXT: "offset": 130, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: "name": "b", +// CHECK-NEXT: "mangledName": "b", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "int" +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "loc": { +// CHECK-NEXT: "offset": 137, +// CHECK-NEXT: "line": 8, +// CHECK-NEXT: "presumedLine": 33, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: }, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": { +// CHECK-NEXT: "offset": 133, +// CHECK-NEXT: "col": 1, +// CHECK-NEXT: "tokLen": 3 +// CHECK-NEXT: }, +// CHECK-NEXT: "end": { +// CHECK-NEXT: "offset": 137, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: "name": "c", +// CHECK-NEXT: "mangledName": "c", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "int" +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "loc": { +// CHECK-NEXT: "offset": 163, +// CHECK-NEXT: "line": 11, +// CHECK-NEXT: "presumedFile": "test.c", +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: }, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": { +// CHECK-NEXT: "offset": 159, +// CHECK-NEXT: "col": 1, +// CHECK-NEXT: "tokLen": 3 +// CHECK-NEXT: }, +// CHECK-NEXT: "end": { +// CHECK-NEXT: "offset": 163, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: "name": "d", +// CHECK-NEXT: "mangledName": "d", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "int" +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: { +// CHECK-NEXT: "id": "0x{{.*}}", +// CHECK-NEXT: "kind": "VarDecl", +// CHECK-NEXT: "loc": { +// CHECK-NEXT: "offset": 170, +// CHECK-NEXT: "line": 12, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: }, +// CHECK-NEXT: "range": { +// CHECK-NEXT: "begin": { +// CHECK-NEXT: "offset": 166, +// CHECK-NEXT: "col": 1, +// CHECK-NEXT: "tokLen": 3 +// CHECK-NEXT: }, +// CHECK-NEXT: "end": { +// CHECK-NEXT: "offset": 170, +// CHECK-NEXT: "col": 5, +// CHECK-NEXT: "tokLen": 1 +// CHECK-NEXT: } +// CHECK-NEXT: }, +// CHECK-NEXT: "name": "e", +// CHECK-NEXT: "mangledName": "e", +// CHECK-NEXT: "type": { +// CHECK-NEXT: "qualType": "int" +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: ] +// CHECK-NEXT: } diff --git a/clang/test/AST/gen_ast_dump_json_test.py b/clang/test/AST/gen_ast_dump_json_test.py index 3a4064699657d..f783c79faef86 100644 --- a/clang/test/AST/gen_ast_dump_json_test.py +++ b/clang/test/AST/gen_ast_dump_json_test.py @@ -180,7 +180,7 @@ def process_file(source_file, clang_binary, cmdline_filters, cmdline_opts, filter_json(j, filters, out_asts) - with tempfile.NamedTemporaryFile("w") as f: + with tempfile.NamedTemporaryFile("wb", delete=False) as f: with open(source_file, "r") as srcf: for line in srcf.readlines(): # copy up to the note: @@ -201,6 +201,7 @@ def process_file(source_file, clang_binary, cmdline_filters, cmdline_opts, f.write(out_str) f.flush() + f.close() if do_update: print("Updating json appended source file 
to %s." % source_file) copyfile(f.name, source_file) @@ -209,6 +210,7 @@ def process_file(source_file, clang_binary, cmdline_filters, cmdline_opts, dest_path = '%s-json%s%s' % (partition[0], partition[1], partition[2]) print("Writing json appended source file to %s." % dest_path) copyfile(f.name, dest_path) + os.remove(f.name) return 0 diff --git a/clang/test/AST/language_address_space_attribute.cpp b/clang/test/AST/language_address_space_attribute.cpp index 8e098d6ce228f..7c6bdca06c06a 100644 --- a/clang/test/AST/language_address_space_attribute.cpp +++ b/clang/test/AST/language_address_space_attribute.cpp @@ -5,32 +5,32 @@ void langas() { // CHECK: VarDecl {{.*}} x_global '__global int *' - __attribute__((ocl_global)) int *x_global; + __attribute__((opencl_global)) int *x_global; // CHECK: VarDecl {{.*}} z_global '__global int *' - [[clang::ocl_global]] int *z_global; + [[clang::opencl_global]] int *z_global; // CHECK: VarDecl {{.*}} x_local '__local int *' - __attribute__((ocl_local)) int *x_local; + __attribute__((opencl_local)) int *x_local; // CHECK: VarDecl {{.*}} z_local '__local int *' - [[clang::ocl_local]] int *z_local; + [[clang::opencl_local]] int *z_local; // CHECK: VarDecl {{.*}} x_constant '__constant int *' - __attribute__((ocl_constant)) int *x_constant; + __attribute__((opencl_constant)) int *x_constant; // CHECK: VarDecl {{.*}} z_constant '__constant int *' - [[clang::ocl_constant]] int *z_constant; + [[clang::opencl_constant]] int *z_constant; // CHECK: VarDecl {{.*}} x_private 'int *' - __attribute__((ocl_private)) int *x_private; + __attribute__((opencl_private)) int *x_private; // CHECK: VarDecl {{.*}} z_private 'int *' - [[clang::ocl_private]] int *z_private; + [[clang::opencl_private]] int *z_private; // CHECK: VarDecl {{.*}} x_generic '__generic int *' - __attribute__((ocl_generic)) int *x_generic; + __attribute__((opencl_generic)) int *x_generic; // CHECK: VarDecl {{.*}} z_generic '__generic int *' - [[clang::ocl_generic]] int *z_generic; + [[clang::opencl_generic]] int *z_generic; } diff --git a/clang/test/Analysis/properties.m b/clang/test/Analysis/properties.m index 2f427f2751820..d83b8ed14f93c 100644 --- a/clang/test/Analysis/properties.m +++ b/clang/test/Analysis/properties.m @@ -1049,6 +1049,8 @@ - (NSObject *)getShadowedIvar; - (void)clearShadowedIvar; - (NSObject *)getShadowedProp; - (void)clearShadowedProp; + +@property (assign) NSObject *o2; @end @implementation Shadowed @@ -1078,7 +1080,7 @@ @implementation Shadowing @synthesize o; -(void)testPropertyShadowing { - NSObject *oo = self.o; + NSObject *oo = self.o; // no-crash clang_analyzer_eval(self.o == oo); // expected-warning{{TRUE}} clang_analyzer_eval([self getShadowedIvar] == oo); // expected-warning{{UNKNOWN}} [self clearShadowedIvar]; @@ -1086,4 +1088,10 @@ -(void)testPropertyShadowing { clang_analyzer_eval([self getShadowedIvar] == oo); // expected-warning{{UNKNOWN}} clang_analyzer_eval([self getShadowedIvar] == nil); // expected-warning{{TRUE}} } + +@synthesize o2 = ooo2; + +-(void)testPropertyShadowingWithExplicitIvar { + NSObject *oo2 = self.o2; // no-crash +} @end diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index 61a97493f5351..e1db6780d90a2 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -20,6 +20,7 @@ extern void clearerr(FILE *stream); extern int feof(FILE *stream); extern int ferror(FILE *stream); extern int fileno(FILE *stream); +extern FILE *freopen(const char *pathname, const char *mode, FILE *stream); void check_fread() { 
FILE *fp = tmpfile(); @@ -111,6 +112,13 @@ void f_double_close(void) { fclose(p); // expected-warning {{Try to close a file Descriptor already closed. Cause undefined behaviour}} } +void f_double_close_alias(void) { + FILE *p1 = fopen("foo", "r"); + FILE *p2 = p1; + fclose(p1); + fclose(p2); // expected-warning {{Try to close a file Descriptor already closed. Cause undefined behaviour}} +} + void f_leak(int c) { FILE *p = fopen("foo.c", "r"); if(c) @@ -134,3 +142,37 @@ void pr7831(FILE *fp) { void pr8081(FILE *stream, long offset, int whence) { fseek(stream, offset, whence); } + +void check_freopen_1() { + FILE *f1 = freopen("foo.c", "r", (FILE *)0); // expected-warning {{Stream pointer might be NULL}} + f1 = freopen(0, "w", (FILE *)0x123456); // Do not report this as error. +} + +void check_freopen_2() { + FILE *f1 = fopen("foo.c", "r"); + if (f1) { + FILE *f2 = freopen(0, "w", f1); + if (f2) { + // Check if f1 and f2 point to the same stream. + fclose(f1); + fclose(f2); // expected-warning {{Try to close a file Descriptor already closed. Cause undefined behaviour}} + } else { + // Reopen failed. + // f1 points now to a possibly invalid stream but this condition is currently not checked. + // f2 is NULL. + rewind(f1); + rewind(f2); // expected-warning {{Stream pointer might be NULL}} + } + } +} + +void check_freopen_3() { + FILE *f1 = fopen("foo.c", "r"); + if (f1) { + // Unchecked result of freopen. + // The f1 may be invalid after this call (not checked by the checker). + freopen(0, "w", f1); + rewind(f1); + fclose(f1); + } +} diff --git a/clang/test/ClangScanDeps/Inputs/has_include_if_elif.json b/clang/test/ClangScanDeps/Inputs/has_include_if_elif.json new file mode 100644 index 0000000000000..36ca006b03297 --- /dev/null +++ b/clang/test/ClangScanDeps/Inputs/has_include_if_elif.json @@ -0,0 +1,7 @@ +[ +{ + "directory": "DIR", + "command": "clang -E DIR/has_include_if_elif2.cpp -IInputs", + "file": "DIR/has_include_if_elif2.cpp" +} +] diff --git a/clang/test/ClangScanDeps/has_include_if_elif.cpp b/clang/test/ClangScanDeps/has_include_if_elif.cpp new file mode 100644 index 0000000000000..dd56ecac69dbd --- /dev/null +++ b/clang/test/ClangScanDeps/has_include_if_elif.cpp @@ -0,0 +1,38 @@ +// RUN: rm -rf %t.dir +// RUN: rm -rf %t.cdb +// RUN: mkdir -p %t.dir +// RUN: cp %s %t.dir/has_include_if_elif2.cpp +// RUN: mkdir %t.dir/Inputs +// RUN: cp %S/Inputs/header.h %t.dir/Inputs/header.h +// RUN: cp %S/Inputs/header.h %t.dir/Inputs/header2.h +// RUN: cp %S/Inputs/header.h %t.dir/Inputs/header3.h +// RUN: cp %S/Inputs/header.h %t.dir/Inputs/header4.h +// RUN: sed -e "s|DIR|%/t.dir|g" %S/Inputs/has_include_if_elif.json > %t.cdb +// +// RUN: clang-scan-deps -compilation-database %t.cdb -j 1 -mode preprocess-minimized-sources | \ +// RUN: FileCheck %s +// RUN: clang-scan-deps -compilation-database %t.cdb -j 1 -mode preprocess | \ +// RUN: FileCheck %s + +#if __has_include("header.h") +#endif + +#if 0 +#elif __has_include("header2.h") +#endif + +#define H3 __has_include("header3.h") +#if H3 +#endif + +#define H4 __has_include("header4.h") + +#if 0 +#elif H4 +#endif + +// CHECK: has_include_if_elif2.cpp +// CHECK-NEXT: Inputs{{/|\\}}header.h +// CHECK-NEXT: Inputs{{/|\\}}header2.h +// CHECK-NEXT: Inputs{{/|\\}}header3.h +// CHECK-NEXT: Inputs{{/|\\}}header4.h diff --git a/clang/test/CodeGen/aarch64-neon-intrinsics.c b/clang/test/CodeGen/aarch64-neon-intrinsics.c index b29d877dd8eca..7744b4f4a159d 100644 --- a/clang/test/CodeGen/aarch64-neon-intrinsics.c +++ 
b/clang/test/CodeGen/aarch64-neon-intrinsics.c @@ -17756,8 +17756,6 @@ float32_t test_vminnmv_f32(float32x2_t a) { } // CHECK-LABEL: @test_vpaddq_s64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VPADDQ_V2_I]] @@ -17766,8 +17764,6 @@ int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) { } // CHECK-LABEL: @test_vpaddq_u64( -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> // CHECK: [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %a, <2 x i64> %b) // CHECK: [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VPADDQ_V2_I]] diff --git a/clang/test/CodeGen/aarch64-neon-vcadd.c b/clang/test/CodeGen/aarch64-neon-vcadd.c new file mode 100644 index 0000000000000..2d721f187fe62 --- /dev/null +++ b/clang/test/CodeGen/aarch64-neon-vcadd.c @@ -0,0 +1,65 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon \ +// RUN: -target-feature +v8.3a -target-feature +fullfp16 -S -emit-llvm -o - %s \ +// RUN: | FileCheck %s + +#include + +void foo16x4_rot90(float16x4_t a, float16x4_t b) +{ +// CHECK: call <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16 + float16x4_t result = vcadd_rot90_f16(a, b); +} + +void foo32x2_rot90(float32x2_t a, float32x2_t b) +{ +// CHECK: call <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32 + float32x2_t result = vcadd_rot90_f32(a, b); +} + +void foo16x8_rot90(float16x8_t a, float16x8_t b) +{ +// CHECK: call <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16 + float16x8_t result = vcaddq_rot90_f16(a, b); +} + +void foo32x4_rot90(float32x4_t a, float32x4_t b) +{ +// CHECK: call <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32 + float32x4_t result = vcaddq_rot90_f32(a, b); +} + +void foo64x2_rot90(float64x2_t a, float64x2_t b) +{ +// CHECK: call <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64 + float64x2_t result = vcaddq_rot90_f64(a, b); +} + +void foo16x4_rot270(float16x4_t a, float16x4_t b) +{ +// CHECK: call <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16 + float16x4_t result = vcadd_rot270_f16(a, b); +} + +void foo32x2_rot270(float32x2_t a, float32x2_t b) +{ +// CHECK: call <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32 + float32x2_t result = vcadd_rot270_f32(a, b); +} + +void foo16x8_rot270(float16x8_t a, float16x8_t b) +{ +// CHECK: call <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16 + float16x8_t result = vcaddq_rot270_f16(a, b); +} + +void foo32x4_rot270(float32x4_t a, float32x4_t b) +{ +// CHECK: call <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32 + float32x4_t result = vcaddq_rot270_f32(a, b); +} + +void foo64x2_rot270(float64x2_t a, float64x2_t b) +{ +// CHECK: call <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64 + float64x2_t result = vcaddq_rot270_f64(a, b); +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/predicates.c b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c new file mode 100644 index 0000000000000..5761849d094be --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/predicates.c @@ -0,0 +1,290 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns 
-O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa -early-cse | FileCheck %s + +#include + +// CHECK-LABEL: @test_vctp16q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp16q(uint32_t a) +{ + return vctp16q(a); +} + +// CHECK-LABEL: @test_vctp16q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <8 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp16q_m(uint32_t a, mve_pred16_t p) +{ + return vctp16q_m(a, p); +} + +// CHECK-LABEL: @test_vctp32q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp32q(uint32_t a) +{ + return vctp32q(a); +} + +// CHECK-LABEL: @test_vctp32q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp32q_m(uint32_t a, mve_pred16_t p) +{ + return vctp32q_m(a, p); +} + +// CHECK-LABEL: @test_vctp64q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp64q(uint32_t a) +{ + return vctp64q(a); +} + +// CHECK-LABEL: @test_vctp64q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp64(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp64q_m(uint32_t a, mve_pred16_t p) +{ + return vctp64q_m(a, p); +} + +// CHECK-LABEL: @test_vctp8q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 
@llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +mve_pred16_t test_vctp8q(uint32_t a) +{ + return vctp8q(a); +} + +// CHECK-LABEL: @test_vctp8q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[A:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = and <16 x i1> [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16 +// CHECK-NEXT: ret i16 [[TMP5]] +// +mve_pred16_t test_vctp8q_m(uint32_t a, mve_pred16_t p) +{ + return vctp8q_m(a, p); +} + +// CHECK-LABEL: @test_vpnot( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = xor i16 [[A:%.*]], -1 +// CHECK-NEXT: ret i16 [[TMP0]] +// +mve_pred16_t test_vpnot(mve_pred16_t a) +{ + return vpnot(a); +} + +// CHECK-LABEL: @test_vpselq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x half> [[A:%.*]], <8 x half> [[B:%.*]] +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vpselq_f16(float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]] +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vpselq_f32(float32x4_t a, float32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_f32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vpselq_s16(int16x8_t a, int16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vpselq_s32(int32x4_t a, int32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <2 x i64> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +int64x2_t test_vpselq_s64(int64x2_t a, int64x2_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s64(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vpselq_s8(int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vpselq_u16(uint16x8_t a, uint16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vpselq_u32(uint32x4_t a, uint32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +uint64x2_t test_vpselq_u64(uint64x2_t a, uint64x2_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u64(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vpselq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <16 x i1> [[TMP1]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vpselq_u8(uint8x16_t a, uint8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vpselq(a, b, p); +#else /* POLYMORPHIC */ + return vpselq_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c new file mode 100644 index 
0000000000000..a416bfb773e6b --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vabdq.c @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vabdq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vabd.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) +{ +#ifdef POLYMORPHIC + return vabdq(a, b); +#else /* POLYMORPHIC */ + return vabdq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vabdq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vabdq(a, b); +#else /* POLYMORPHIC */ + return vabdq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vabdq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vabd.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_vabdq_f32(float16x8_t a, float16x8_t b) +{ +#ifdef POLYMORPHIC + return vabdq(a, b); +#else /* POLYMORPHIC */ + return vabdq_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vabdq_m_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.abd.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vabdq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vabdq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vabdq_m_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vabdq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vabdq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vabdq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vabdq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vabdq_m_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.abd.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> 
[[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vabdq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vabdq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vabdq_m_f32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vandq.c b/clang/test/CodeGen/arm-mve-intrinsics/vandq.c new file mode 100644 index 0000000000000..aeab8b7063ece --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vandq.c @@ -0,0 +1,72 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vandq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vandq(a, b); +#else /* POLYMORPHIC */ + return vandq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vandq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = and <4 x i32> [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vandq_f32(float32x4_t a, float32x4_t b) +{ +#ifdef POLYMORPHIC + return vandq(a, b); +#else /* POLYMORPHIC */ + return vandq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vandq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.and.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vandq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vandq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vandq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vandq_m_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[INACTIVE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i1> [[TMP3]], <8 x i16> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP6]] +// +float16x8_t test_vandq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vandq_m(inactive, a, b, p); 
+#else /* POLYMORPHIC */
+ return vandq_m_f16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c b/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
new file mode 100644
index 0000000000000..3106b40a322d1
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vbicq.c
@@ -0,0 +1,74 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vbicq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+ return vbicq(a, b);
+#else /* POLYMORPHIC */
+ return vbicq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT: [[TMP3:%.*]] = and <4 x i32> [[TMP0]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP4]]
+//
+float32x4_t test_vbicq_f32(float32x4_t a, float32x4_t b)
+{
+#ifdef POLYMORPHIC
+ return vbicq(a, b);
+#else /* POLYMORPHIC */
+ return vbicq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.bic.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vbicq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vbicq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vbicq_m_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vbicq_m_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]])
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[INACTIVE:%.*]] to <8 x i16>
+// CHECK-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i1> [[TMP3]], <8 x i16> [[TMP4]])
+// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <8 x half>
+// CHECK-NEXT: ret <8 x half> [[TMP6]]
+//
+float16x8_t test_vbicq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vbicq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vbicq_m_f16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
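
The vbicq tests above depend only on BIC being plain bit math: vbicq(a, b) computes a AND NOT b, so the unpredicated forms need no target intrinsic and lower to an all-ones xor feeding an and, while only the predicated forms call @llvm.arm.mve.bic.predicated. A minimal scalar sketch of what one lane computes (the helper name is ours, purely illustrative, and not part of the test suite):

```c
#include <stdint.h>

/* One lane of vbicq_u32: bit-clear is AND with the complement.
 * Clang emits this as xor(b, -1) followed by and(a, ...), which is
 * the instruction pair the autogenerated CHECK lines above match. */
static inline uint32_t bic_lane_u32(uint32_t a, uint32_t b)
{
    return a & ~b;
}
```

diff --git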
a/clang/test/CodeGen/arm-mve-intrinsics/veorq.c b/clang/test/CodeGen/arm-mve-intrinsics/veorq.c new file mode 100644 index 0000000000000..c271568f791f3 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/veorq.c @@ -0,0 +1,72 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_veorq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return veorq(a, b); +#else /* POLYMORPHIC */ + return veorq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_veorq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_veorq_f32(float32x4_t a, float32x4_t b) +{ +#ifdef POLYMORPHIC + return veorq(a, b); +#else /* POLYMORPHIC */ + return veorq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_veorq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.eor.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_veorq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return veorq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return veorq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_veorq_m_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[INACTIVE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i1> [[TMP3]], <8 x i16> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP6]] +// +float16x8_t test_veorq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return veorq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return veorq_m_f16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c new file mode 100644 index 0000000000000..63300466c819e --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxnmq.c 
@@ -0,0 +1,65 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vmaxnmq_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP0]]
+//
+float16x8_t test_vmaxnmq_f16(float16x8_t a, float16x8_t b)
+{
+#ifdef POLYMORPHIC
+ return vmaxnmq(a, b);
+#else /* POLYMORPHIC */
+ return vmaxnmq_f16(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP0]]
+//
+float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b)
+{
+#ifdef POLYMORPHIC
+ return vmaxnmq(a, b);
+#else /* POLYMORPHIC */
+ return vmaxnmq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmq_m_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vmaxnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmaxnmq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmaxnmq_m_f16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmaxnmq_m_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x float> [[TMP2]]
+//
+float32x4_t test_vmaxnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmaxnmq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmaxnmq_m_f32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
new file mode 100644
index 0000000000000..133e28d6cf047
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vmaxq.c
@@ -0,0 +1,98 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
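Unlike the float vmaxnmq above, integer vmaxq has no unpredicated IR intrinsic: the tests below expect each lane to become an icmp feeding a select, with the signedness of the compare (slt versus ult) being the only difference between the _s and _u variants. A minimal scalar sketch of one lane (the helper name is ours, not part of the test):

```c
#include <stdint.h>

/* One lane of vmaxq_s8: a signed compare-and-select. The vector CHECK
 * lines below match the icmp slt + select pair this expression lowers to. */
static inline int8_t max_lane_s8(int8_t a, int8_t b)
{
    return a < b ? b : a;
}
```
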
+// CHECK-LABEL: @test_vmaxq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <16 x i8> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = select <16 x i1> [[TMP0]], <16 x i8> [[B]], <16 x i8> [[A]] +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) +{ +#ifdef POLYMORPHIC + return vmaxq(a, b); +#else /* POLYMORPHIC */ + return vmaxq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = icmp ult <8 x i16> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[B]], <8 x i16> [[A]] +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) +{ +#ifdef POLYMORPHIC + return vmaxq(a, b); +#else /* POLYMORPHIC */ + return vmaxq_u16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = icmp slt <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[B]], <4 x i32> [[A]] +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) +{ +#ifdef POLYMORPHIC + return vmaxq(a, b); +#else /* POLYMORPHIC */ + return vmaxq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxq_m_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.max.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vmaxq_m_u8(uint8x16_t inactive, uint8x16_t a, uint8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmaxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmaxq_m_u8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxq_m_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.max.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vmaxq_m_s16(int16x8_t inactive, int16x8_t a, int16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmaxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmaxq_m_s16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxq_m_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.max.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmaxq_m_u32(uint32x4_t inactive, uint32x4_t a, uint32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmaxq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmaxq_m_u32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c new file mode 100644 index 0000000000000..9ed5bf0c859be --- /dev/null +++ 
b/clang/test/CodeGen/arm-mve-intrinsics/vminnmq.c @@ -0,0 +1,65 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vminnmq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_vminnmq_f16(float16x8_t a, float16x8_t b) +{ +#ifdef POLYMORPHIC + return vminnmq(a, b); +#else /* POLYMORPHIC */ + return vminnmq_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) +{ +#ifdef POLYMORPHIC + return vminnmq(a, b); +#else /* POLYMORPHIC */ + return vminnmq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmq_m_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vminnmq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vminnmq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vminnmq_m_f16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmq_m_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]], <4 x float> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vminnmq_m_f32(float32x4_t inactive, float32x4_t a, float32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vminnmq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vminnmq_m_f32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c new file mode 100644 index 0000000000000..9e54eaeb5d839 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminq.c @@ -0,0 +1,98 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O3 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s 
| opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vminq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <16 x i8> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = select <16 x i1> [[TMP0]], <16 x i8> [[B]], <16 x i8> [[A]] +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) +{ +#ifdef POLYMORPHIC + return vminq(a, b); +#else /* POLYMORPHIC */ + return vminq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = icmp sgt <8 x i16> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = select <8 x i1> [[TMP0]], <8 x i16> [[B]], <8 x i16> [[A]] +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) +{ +#ifdef POLYMORPHIC + return vminq(a, b); +#else /* POLYMORPHIC */ + return vminq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = icmp ugt <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[B]], <4 x i32> [[A]] +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vminq(a, b); +#else /* POLYMORPHIC */ + return vminq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vminq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vminq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vminq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminq_m_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vminq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vminq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vminq_m_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vminq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vminq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vminq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c new file mode 100644 index 
0000000000000..63696d698c503 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulhq.c @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vmulhq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vmulhq_u8(uint8x16_t a, uint8x16_t b) +{ +#ifdef POLYMORPHIC + return vmulhq(a, b); +#else /* POLYMORPHIC */ + return vmulhq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vmulhq_s16(int16x8_t a, int16x8_t b) +{ +#ifdef POLYMORPHIC + return vmulhq(a, b); +#else /* POLYMORPHIC */ + return vmulhq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulhq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vmulhq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vmulhq(a, b); +#else /* POLYMORPHIC */ + return vmulhq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulhq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulhq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulhq_m_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulhq_m_u16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulhq_m_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> 
[[INACTIVE:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulhq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulhq_m_s32(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c new file mode 100644 index 0000000000000..ac457cba81ebc --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vmulq.c @@ -0,0 +1,125 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vmulq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <16 x i8> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <8 x i16> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = fmul <4 x float> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) +{ +#ifdef POLYMORPHIC + return vmulq(a, b); +#else /* POLYMORPHIC */ + return vmulq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vmulq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmulq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vmulq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmulq_m_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <8 x 
i16> [[TMP2]]
+//
+uint16x8_t test_vmulq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vmulq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vmulq_m_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x i1> [[TMP1]], <8 x half> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x half> [[TMP2]]
+//
+float16x8_t test_vmulq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vmulq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vmulq_m_f16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vornq.c b/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
new file mode 100644
index 0000000000000..753a6ddf2ee17
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vornq.c
@@ -0,0 +1,74 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vornq_u32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT: [[TMP1:%.*]] = or <4 x i32> [[A:%.*]], [[TMP0]]
+// CHECK-NEXT: ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b)
+{
+#ifdef POLYMORPHIC
+ return vornq(a, b);
+#else /* POLYMORPHIC */
+ return vornq_u32(a, b);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vornq_f32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32>
+// CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP0]], [[TMP2]]
+// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float>
+// CHECK-NEXT: ret <4 x float> [[TMP4]]
+//
+float32x4_t test_vornq_f32(float32x4_t a, float32x4_t b)
+{
+#ifdef POLYMORPHIC
+ return vornq(a, b);
+#else /* POLYMORPHIC */
+ return vornq_f32(a, b);
+#endif /* POLYMORPHIC */
+}
+
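vornq (OR-complement) mirrors vbicq: vornq(a, b) computes a OR NOT b, which is why the unpredicated forms above lower to an all-ones xor feeding an or, and only the predicated form below calls @llvm.arm.mve.orn.predicated. A minimal scalar sketch of one lane (the helper name is ours, illustrative only):

```c
#include <stdint.h>

/* One lane of vornq_u32: OR with the complement of the second operand,
 * matching the xor(b, -1) + or(a, ...) pattern in the CHECK lines above. */
static inline uint32_t orn_lane_u32(uint32_t a, uint32_t b)
{
    return a | ~b;
}
```

+// CHECK-LABEL: @test_vornq_m_s8(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: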
[[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.orn.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vornq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vornq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vornq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vornq_m_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[INACTIVE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i1> [[TMP3]], <8 x i16> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP6]] +// +float16x8_t test_vornq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vornq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vornq_m_f16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c b/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c new file mode 100644 index 0000000000000..436f6277e073f --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vorrq.c @@ -0,0 +1,72 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vorrq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = or <4 x i32> [[A:%.*]], [[B:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vorrq(a, b); +#else /* POLYMORPHIC */ + return vorrq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP0]], [[TMP1]] +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vorrq_f32(float32x4_t a, float32x4_t b) +{ +#ifdef POLYMORPHIC + return vorrq(a, b); +#else /* POLYMORPHIC */ + return vorrq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 
x i8> [[INACTIVE:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vorrq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vorrq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vorrq_m_s8(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_m_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[A:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[B:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x half> [[INACTIVE:%.*]] to <8 x i16> +// CHECK-NEXT: [[TMP5:%.*]] = call <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <8 x i1> [[TMP3]], <8 x i16> [[TMP4]]) +// CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP6]] +// +float16x8_t test_vorrq_m_f16(float16x8_t inactive, float16x8_t a, float16x8_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vorrq_m(inactive, a, b, p); +#else /* POLYMORPHIC */ + return vorrq_m_f16(inactive, a, b, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c new file mode 100644 index 0000000000000..2c8148405585b --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vrmulhq.c @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vrmulhq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vrmulhq_u8(uint8x16_t a, uint8x16_t b) +{ +#ifdef POLYMORPHIC + return vrmulhq(a, b); +#else /* POLYMORPHIC */ + return vrmulhq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vrmulhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vrmulhq_s16(int16x8_t a, int16x8_t b) +{ +#ifdef POLYMORPHIC + return vrmulhq(a, b); +#else /* POLYMORPHIC */ + return vrmulhq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vrmulhq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vrmulhq_u32(uint32x4_t a, uint32x4_t b) +{ +#ifdef POLYMORPHIC + return vrmulhq(a, b); +#else /* POLYMORPHIC */ + return vrmulhq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vrmulhq_m_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> 
@llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP1]], <16 x i8> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <16 x i8> [[TMP2]]
+//
+int8x16_t test_vrmulhq_m_s8(int8x16_t inactive, int8x16_t a, int8x16_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vrmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vrmulhq_m_s8(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrmulhq_m_u16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP1]], <8 x i16> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <8 x i16> [[TMP2]]
+//
+uint16x8_t test_vrmulhq_m_u16(uint16x8_t inactive, uint16x8_t a, uint16x8_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vrmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vrmulhq_m_u16(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vrmulhq_m_s32(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> [[INACTIVE:%.*]])
+// CHECK-NEXT: ret <4 x i32> [[TMP2]]
+//
+int32x4_t test_vrmulhq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+ return vrmulhq_m(inactive, a, b, p);
+#else /* POLYMORPHIC */
+ return vrmulhq_m_s32(inactive, a, b, p);
+#endif /* POLYMORPHIC */
+}
diff --git a/clang/test/CodeGen/arm-neon-vcadd.c b/clang/test/CodeGen/arm-neon-vcadd.c
new file mode 100644
index 0000000000000..00b4641b5a087
--- /dev/null
+++ b/clang/test/CodeGen/arm-neon-vcadd.c
@@ -0,0 +1,54 @@
+// REQUIRES: arm-registered-target
+// RUN: %clang_cc1 -triple armv8.3a-arm-none-eabi -target-cpu generic \
+// RUN: -target-feature +fullfp16 -mfloat-abi soft -S -emit-llvm -o - %s | \
+// RUN: opt -S -sroa -o - | FileCheck %s
+
+#include <arm_neon.h>
+
+void foo16x4_rot90(float16x4_t a, float16x4_t b)
+{
+// CHECK: call <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16
+ float16x4_t result = vcadd_rot90_f16(a, b);
+}
+
+void foo32x2_rot90(float32x2_t a, float32x2_t b)
+{
+// CHECK: call <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32
+ float32x2_t result = vcadd_rot90_f32(a, b);
+}
+
+void foo16x8_rot90(float16x8_t a, float16x8_t b)
+{
+// CHECK: call <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16
+ float16x8_t result = vcaddq_rot90_f16(a, b);
+}
+
+void foo32x4_rot90(float32x4_t a, float32x4_t b)
+{
+// CHECK: call <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32
+ float32x4_t result = vcaddq_rot90_f32(a, b);
+}
+
+void foo16x4_rot270(float16x4_t a, float16x4_t b)
+{
+// CHECK: call <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16
+ float16x4_t result = vcadd_rot270_f16(a, b);
+}
+
+void foo32x2_rot270(float32x2_t a, float32x2_t b)
+{
+// CHECK: call <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32
+ float32x2_t result = vcadd_rot270_f32(a, b);
+}
+
+void foo16x8_rot270(float16x8_t a, float16x8_t b)
+{
+// CHECK: call <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16
+ float16x8_t result = vcaddq_rot270_f16(a, b);
+}
+
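These vcadd tests exercise the Armv8.3-A complex-addition forms: lanes are treated as interleaved (real, imaginary) pairs, and the second operand is rotated by 90 or 270 degrees in the complex plane before the add. A minimal scalar sketch of one lane pair, assuming the usual interleaved layout (the struct and helper are ours, illustrative only):

```c
/* One (re, im) lane pair of vcadd_rot90_f32: rotating b by 90 degrees
 * multiplies it by i, so the result is (a.re - b.im, a.im + b.re).
 * For the rot270 variants the signs flip: (a.re + b.im, a.im - b.re). */
struct cplx { float re, im; };

static inline struct cplx vcadd_rot90_lane(struct cplx a, struct cplx b)
{
    struct cplx r = { a.re - b.im, a.im + b.re };
    return r;
}
```

+void foo32x4_rot270(float32x4_t a, float32x4_t b)
+{
+// CHECK: call <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32
+ float32x4_t result =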
vcaddq_rot270_f32(a, b); +} diff --git a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c index 6f5867b6c11f7..5462c17a1cc50 100644 --- a/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.1a-neon-intrinsics.c @@ -13,7 +13,7 @@ // CHECK-LABEL: test_vqrdmlah_s16 int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) -// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) +// CHECK-ARM: call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) // CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) // CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) @@ -23,7 +23,7 @@ int16x4_t test_vqrdmlah_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-LABEL: test_vqrdmlah_s32 int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) -// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) +// CHECK-ARM: call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) // CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) // CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) @@ -33,7 +33,7 @@ int32x2_t test_vqrdmlah_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-LABEL: test_vqrdmlahq_s16 int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) -// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) +// CHECK-ARM: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) // CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) // CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) @@ -43,7 +43,7 @@ int16x8_t test_vqrdmlahq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-LABEL: test_vqrdmlahq_s32 int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { // CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) -// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) +// CHECK-ARM: call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) // CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) // CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) @@ -54,7 +54,7 @@ int32x4_t test_vqrdmlahq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> // CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) -// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) +// CHECK-ARM: call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) // CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> // CHECK-AARCH64: 
call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) @@ -66,7 +66,7 @@ int16x4_t test_vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> // CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) -// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) +// CHECK-ARM: call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) // CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> // CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) @@ -78,7 +78,7 @@ int32x2_t test_vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { // CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> // CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) -// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) +// CHECK-ARM: call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) // CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> // CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) @@ -90,7 +90,7 @@ int16x8_t test_vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { // CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> // CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) -// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) +// CHECK-ARM: call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) // CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> // CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) @@ -101,7 +101,7 @@ int32x4_t test_vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { // CHECK-LABEL: test_vqrdmlsh_s16 int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) -// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) +// CHECK-ARM: call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) // CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) // CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) @@ -111,7 +111,7 @@ int16x4_t test_vqrdmlsh_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-LABEL: test_vqrdmlsh_s32 int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) -// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) +// CHECK-ARM: call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) // CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) // CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x 
i32> {{%.*}}, <2 x i32> {{%.*}}) @@ -121,7 +121,7 @@ int32x2_t test_vqrdmlsh_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-LABEL: test_vqrdmlshq_s16 int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) -// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) +// CHECK-ARM: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) // CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) // CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) @@ -131,7 +131,7 @@ int16x8_t test_vqrdmlshq_s16(int16x8_t a, int16x8_t b, int16x8_t c) { // CHECK-LABEL: test_vqrdmlshq_s32 int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { // CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) -// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) +// CHECK-ARM: call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) // CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) // CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) @@ -142,7 +142,7 @@ int32x4_t test_vqrdmlshq_s32(int32x4_t a, int32x4_t b, int32x4_t c) { int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { // CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> // CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) -// CHECK-ARM: call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) +// CHECK-ARM: call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) // CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> // CHECK-AARCH64: call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> {{%.*}}, <4 x i16> {{%.*}}) @@ -154,7 +154,7 @@ int16x4_t test_vqrdmlsh_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { // CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> // CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) -// CHECK-ARM: call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) +// CHECK-ARM: call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) // CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> // CHECK-AARCH64: call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> {{%.*}}, <2 x i32> {{%.*}}) @@ -166,7 +166,7 @@ int32x2_t test_vqrdmlsh_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { // CHECK-ARM: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> // CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) -// CHECK-ARM: call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) +// CHECK-ARM: call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) // CHECK-AARCH64: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <8 x i32> // CHECK-AARCH64: call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> {{%.*}}, <8 x i16> {{%.*}}) @@ -178,7 +178,7 @@ 
int16x8_t test_vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { int32x4_t test_vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { // CHECK-ARM: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> // CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) -// CHECK-ARM: call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) +// CHECK-ARM: call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) // CHECK-AARCH64: shufflevector <2 x i32> {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> // CHECK-AARCH64: call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> {{%.*}}, <4 x i32> {{%.*}}) diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c index 5c34d1c37de09..9f1a64554155c 100644 --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -2147,6 +2147,13 @@ int8x8_t test_vcreate_s8(uint64_t a) { return vclz_s8(vcreate_s8(a)); } +// CHECK-LABEL: @test_vcreate_imm +// CHECK: [[RES:%.*]] = bitcast i64 0 to <4 x i16> +// CHECK: ret <4 x i16> [[RES]] +int16x4_t test_vcreate_imm(void) { + return vcreate_s16(0); +} + // CHECK-LABEL: @test_vcreate_s16( // CHECK: [[TMP0:%.*]] = bitcast i64 %a to <4 x i16> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8> @@ -9523,7 +9530,7 @@ int32x4_t test_vqabsq_s32(int32x4_t a) { } // CHECK-LABEL: @test_vqadd_s8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) // CHECK: ret <8 x i8> [[VQADD_V_I]] int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { return vqadd_s8(a, b); @@ -9532,7 +9539,7 @@ int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) { // CHECK-LABEL: @test_vqadd_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) // CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQADD_V2_I]] int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { @@ -9542,7 +9549,7 @@ int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) { // CHECK-LABEL: @test_vqadd_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b) // CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQADD_V2_I]] int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { @@ -9552,7 +9559,7 @@ int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) { // CHECK-LABEL: @test_vqadd_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b) // CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> // CHECK: ret <1 x i64> [[VQADD_V2_I]] int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) { @@ -9560,7 +9567,7 @@ int64x1_t 
test_vqadd_s64(int64x1_t a, int64x1_t b) { } // CHECK-LABEL: @test_vqadd_u8( -// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK: [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %a, <8 x i8> %b) // CHECK: ret <8 x i8> [[VQADD_V_I]] uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { return vqadd_u8(a, b); @@ -9569,7 +9576,7 @@ uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: @test_vqadd_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK: [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %a, <4 x i16> %b) // CHECK: [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQADD_V2_I]] uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { @@ -9579,7 +9586,7 @@ uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) { // CHECK-LABEL: @test_vqadd_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK: [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %a, <2 x i32> %b) // CHECK: [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQADD_V2_I]] uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { @@ -9589,7 +9596,7 @@ uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) { // CHECK-LABEL: @test_vqadd_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK: [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %a, <1 x i64> %b) // CHECK: [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8> // CHECK: ret <1 x i64> [[VQADD_V2_I]] uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { @@ -9597,7 +9604,7 @@ uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) { } // CHECK-LABEL: @test_vqaddq_s8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) // CHECK: ret <16 x i8> [[VQADDQ_V_I]] int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) { return vqaddq_s8(a, b); @@ -9606,7 +9613,7 @@ int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) { // CHECK-LABEL: @test_vqaddq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQADDQ_V2_I]] int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) { @@ -9616,7 +9623,7 @@ int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) { // CHECK-LABEL: @test_vqaddq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK: 
[[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQADDQ_V2_I]] int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) { @@ -9626,7 +9633,7 @@ int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) { // CHECK-LABEL: @test_vqaddq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQADDQ_V2_I]] int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) { @@ -9634,7 +9641,7 @@ int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) { } // CHECK-LABEL: @test_vqaddq_u8( -// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK: [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) // CHECK: ret <16 x i8> [[VQADDQ_V_I]] uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) { return vqaddq_u8(a, b); @@ -9643,7 +9650,7 @@ uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) { // CHECK-LABEL: @test_vqaddq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK: [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQADDQ_V2_I]] uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) { @@ -9653,7 +9660,7 @@ uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) { // CHECK-LABEL: @test_vqaddq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK: [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> %a, <4 x i32> %b) // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQADDQ_V2_I]] uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) { @@ -9663,7 +9670,7 @@ uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) { // CHECK-LABEL: @test_vqaddq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK: [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %a, <2 x i64> %b) // CHECK: [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQADDQ_V2_I]] uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) { @@ -9675,7 +9682,7 @@ uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: 
[[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_s16(a, b, c); @@ -9686,7 +9693,7 @@ int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_s32(a, b, c); @@ -9698,7 +9705,7 @@ int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_lane_s16(a, b, c, 3); @@ -9710,7 +9717,7 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_lane_s32(a, b, c, 1); @@ -9725,7 +9732,7 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) +// CHECK: [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) // CHECK: ret <4 x i32> [[VQDMLAL_V6_I]] int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlal_n_s16(a, b, c); @@ -9738,7 +9745,7 @@ int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) +// CHECK: [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 
x i64> %a, <2 x i64> [[VQDMLAL3_I]]) // CHECK: ret <2 x i64> [[VQDMLAL_V4_I]] int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlal_n_s32(a, b, c); @@ -9749,7 +9756,7 @@ int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> %c) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_s16(a, b, c); @@ -9760,7 +9767,7 @@ int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> %c) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_s32(a, b, c); @@ -9772,7 +9779,7 @@ int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_lane_s16(a, b, c, 3); @@ -9784,7 +9791,7 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> // CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_lane_s32(a, b, c, 1); @@ -9799,7 +9806,7 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8> // CHECK: [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[VECINIT3_I]]) -// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) +// CHECK: [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL5_I]]) // CHECK: ret <4 x i32> [[VQDMLSL_V6_I]] 
int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { return vqdmlsl_n_s16(a, b, c); @@ -9812,7 +9819,7 @@ int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) { // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8> // CHECK: [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[VECINIT1_I]]) -// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) +// CHECK: [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL3_I]]) // CHECK: ret <2 x i64> [[VQDMLSL_V4_I]] int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { return vqdmlsl_n_s32(a, b, c); @@ -10961,7 +10968,7 @@ uint32x2_t test_vqshrun_n_s64(int64x2_t a) { } // CHECK-LABEL: @test_vqsub_s8( -// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) // CHECK: ret <8 x i8> [[VQSUB_V_I]] int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { return vqsub_s8(a, b); @@ -10970,7 +10977,7 @@ int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) { // CHECK-LABEL: @test_vqsub_s16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQSUB_V2_I]] int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { @@ -10980,7 +10987,7 @@ int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) { // CHECK-LABEL: @test_vqsub_s32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %a, <2 x i32> %b) // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQSUB_V2_I]] int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { @@ -10990,7 +10997,7 @@ int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) { // CHECK-LABEL: @test_vqsub_s64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %a, <1 x i64> %b) // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> // CHECK: ret <1 x i64> [[VQSUB_V2_I]] int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { @@ -10998,7 +11005,7 @@ int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) { } // CHECK-LABEL: @test_vqsub_u8( -// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) +// CHECK: [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %a, <8 x i8> %b) // CHECK: ret <8 x i8> [[VQSUB_V_I]] uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { return vqsub_u8(a, b); @@ -11007,7 +11014,7 @@ uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) { // CHECK-LABEL: @test_vqsub_u16( // CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> 
// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %a, <4 x i16> %b) +// CHECK: [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %a, <4 x i16> %b) // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQSUB_V2_I]] uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { @@ -11017,7 +11024,7 @@ uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) { // CHECK-LABEL: @test_vqsub_u32( // CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %a, <2 x i32> %b) +// CHECK: [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %a, <2 x i32> %b) // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQSUB_V2_I]] uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { @@ -11027,7 +11034,7 @@ uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) { // CHECK-LABEL: @test_vqsub_u64( // CHECK: [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8> // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8> -// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %a, <1 x i64> %b) +// CHECK: [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %a, <1 x i64> %b) // CHECK: [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8> // CHECK: ret <1 x i64> [[VQSUB_V2_I]] uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { @@ -11035,7 +11042,7 @@ uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) { } // CHECK-LABEL: @test_vqsubq_s8( -// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) // CHECK: ret <16 x i8> [[VQSUBQ_V_I]] int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { return vqsubq_s8(a, b); @@ -11044,7 +11051,7 @@ int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) { // CHECK-LABEL: @test_vqsubq_s16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { @@ -11054,7 +11061,7 @@ int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) { // CHECK-LABEL: @test_vqsubq_s32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { @@ -11064,7 +11071,7 @@ int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) { // CHECK-LABEL: @test_vqsubq_s64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> 
@llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> %b) // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { @@ -11072,7 +11079,7 @@ int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) { } // CHECK-LABEL: @test_vqsubq_u8( -// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) +// CHECK: [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) // CHECK: ret <16 x i8> [[VQSUBQ_V_I]] uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { return vqsubq_u8(a, b); @@ -11081,7 +11088,7 @@ uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) { // CHECK-LABEL: @test_vqsubq_u16( // CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %a, <8 x i16> %b) +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQSUBQ_V2_I]] uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { @@ -11091,7 +11098,7 @@ uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) { // CHECK-LABEL: @test_vqsubq_u32( // CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %a, <4 x i32> %b) +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %a, <4 x i32> %b) // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQSUBQ_V2_I]] uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { @@ -11101,7 +11108,7 @@ uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) { // CHECK-LABEL: @test_vqsubq_u64( // CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> // CHECK: [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8> -// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %a, <2 x i64> %b) +// CHECK: [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %a, <2 x i64> %b) // CHECK: [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQSUBQ_V2_I]] uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) { diff --git a/clang/test/CodeGen/builtins-mips-args.c b/clang/test/CodeGen/builtins-mips-args.c index cdb42af4a53d1..a135848805aaf 100644 --- a/clang/test/CodeGen/builtins-mips-args.c +++ b/clang/test/CodeGen/builtins-mips-args.c @@ -1,5 +1,6 @@ // REQUIRES: mips-registered-target -// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -target-feature +dspr2 \ +// RUN: -fsyntax-only -verify %s void foo() { // MIPS DSP Rev 1 diff --git a/clang/test/CodeGen/builtins-mips.c b/clang/test/CodeGen/builtins-mips.c index c6be896e81928..d26f630c35d7d 100644 --- a/clang/test/CodeGen/builtins-mips.c +++ b/clang/test/CodeGen/builtins-mips.c @@ -1,5 +1,6 @@ // REQUIRES: mips-registered-target -// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -emit-llvm %s -o - \ +// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -emit-llvm %s \ +// RUN: -target-feature +dspr2 -o - \ // RUN: | FileCheck %s typedef 
int q31; diff --git a/clang/test/CodeGen/catch-implicit-conversions-basics-negatives.c b/clang/test/CodeGen/catch-implicit-conversions-basics-negatives.c new file mode 100644 index 0000000000000..2e060cfcddef3 --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-conversions-basics-negatives.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -fsanitize=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change -fsanitize-recover=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// If we have an enum, it will be promoted to an unsigned integer. +// But both types are unsigned and have the same bitwidth, +// so we should not emit any sanitization. Also, for inc/dec we currently +// assume (and assert) that at least one of the types involved is signed, +// which isn't the case here. +typedef enum { a } b; +b t0(b c) { + c--; + return c; +} diff --git a/clang/test/CodeGen/catch-implicit-conversions-incdec-basics.c b/clang/test/CodeGen/catch-implicit-conversions-incdec-basics.c new file mode 100644 index 0000000000000..e97a72cb0a339 --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-conversions-incdec-basics.c @@ -0,0 +1,139 @@ +// RUN: %clang_cc1 -fsanitize=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change -fsanitize-recover=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// CHECK-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[UNSIGNED_CHAR:.*]] = {{.*}} c"'unsigned char'\00" } +// CHECK-DAG: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1100:.*]] = {{.*}}, i32 1100, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1200:.*]] = {{.*}}, i32 1200, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[SIGNED_CHAR:.*]] = {{.*}} c"'signed char'\00" } +// CHECK-DAG: @[[LINE_1300:.*]] = {{.*}}, i32 1300, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 }
+// CHECK-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } + +// CHECK-LABEL: @t0( +unsigned short t0(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*) +#line 100 + x++; + return x; +} +// CHECK-LABEL: @t1( +unsigned short t1(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*) +#line 200 + x--; + return x; +} +// CHECK-LABEL: @t2( +unsigned short t2(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*) +#line 300 + ++x; + return x; +} +// CHECK-LABEL: @t3( +unsigned short t3(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*) +#line 400 + --x; + return x; +} + +// CHECK-LABEL: @t4( +signed short t4(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*) +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*) +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*) +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*) +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_900]] to i8*) +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1000]] to i8*) +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1100]] to i8*) +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1200]] to i8*) +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1300]] to i8*) +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1400]] to i8*) +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed 
char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1500]] to i8*) +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1600]] to i8*) +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/catch-implicit-integer-arithmetic-value-change-incdec-basics.c b/clang/test/CodeGen/catch-implicit-integer-arithmetic-value-change-incdec-basics.c new file mode 100644 index 0000000000000..5e0aa1108dfc9 --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-integer-arithmetic-value-change-incdec-basics.c @@ -0,0 +1,139 @@ +// RUN: %clang_cc1 -fsanitize=implicit-signed-integer-truncation,implicit-integer-sign-change -fsanitize-recover=implicit-signed-integer-truncation,implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// CHECK-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[UNSIGNED_CHAR:.*]] = {{.*}} c"'unsigned char'\00" } +// CHECK-DAG: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1100:.*]] = {{.*}}, i32 1100, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1200:.*]] = {{.*}}, i32 1200, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[SIGNED_CHAR:.*]] = {{.*}} c"'signed char'\00" } +// CHECK-DAG: @[[LINE_1300:.*]] = {{.*}}, i32 1300, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } + +// CHECK-LABEL: @t0( +unsigned short t0(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*) +#line 100 + x++; + return x; +} +// CHECK-LABEL: @t1( +unsigned short t1(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ 
{{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*) +#line 200 + x--; + return x; +} +// CHECK-LABEL: @t2( +unsigned short t2(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*) +#line 300 + ++x; + return x; +} +// CHECK-LABEL: @t3( +unsigned short t3(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*) +#line 400 + --x; + return x; +} + +// CHECK-LABEL: @t4( +signed short t4(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*) +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*) +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*) +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*) +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_900]] to i8*) +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1000]] to i8*) +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1100]] to i8*) +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1200]] to i8*) +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1300]] to i8*) +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1400]] to i8*) +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1500]] to i8*) +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1600]] to i8*) +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/catch-implicit-integer-conversions-incdec-basics.c b/clang/test/CodeGen/catch-implicit-integer-conversions-incdec-basics.c new file mode 100644 index 0000000000000..e97a72cb0a339 --- /dev/null +++ 
b/clang/test/CodeGen/catch-implicit-integer-conversions-incdec-basics.c @@ -0,0 +1,139 @@ +// RUN: %clang_cc1 -fsanitize=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change -fsanitize-recover=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// CHECK-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[UNSIGNED_CHAR:.*]] = {{.*}} c"'unsigned char'\00" } +// CHECK-DAG: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1100:.*]] = {{.*}}, i32 1100, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1200:.*]] = {{.*}}, i32 1200, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[SIGNED_CHAR:.*]] = {{.*}} c"'signed char'\00" } +// CHECK-DAG: @[[LINE_1300:.*]] = {{.*}}, i32 1300, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } + +// CHECK-LABEL: @t0( +unsigned short t0(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*) +#line 100 + x++; + return x; +} +// CHECK-LABEL: @t1( +unsigned short t1(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*) +#line 200 + x--; + return x; +} +// CHECK-LABEL: @t2( +unsigned short t2(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*) +#line 300 + ++x; + return x; +} +// CHECK-LABEL: @t3( +unsigned short t3(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*) +#line 400 + --x; + return x; +} + +// CHECK-LABEL: @t4( +signed short t4(signed 
short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*) +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*) +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*) +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*) +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_900]] to i8*) +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1000]] to i8*) +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1100]] to i8*) +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1200]] to i8*) +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1300]] to i8*) +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1400]] to i8*) +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1500]] to i8*) +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1600]] to i8*) +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/catch-implicit-integer-sign-changes-incdec-basics.c b/clang/test/CodeGen/catch-implicit-integer-sign-changes-incdec-basics.c new file mode 100644 index 0000000000000..93495b331b9f8 --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-integer-sign-changes-incdec-basics.c @@ -0,0 +1,139 @@ +// RUN: %clang_cc1 -fsanitize=implicit-integer-sign-change -fsanitize-recover=implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// CHECK-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 4 }, {{.*}}* 
@[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-DAG: @[[UNSIGNED_CHAR:.*]] = {{.*}} c"'unsigned char'\00" } +// CHECK-DAG: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[LINE_1100:.*]] = {{.*}}, i32 1100, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[LINE_1200:.*]] = {{.*}}, i32 1200, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[SIGNED_CHAR:.*]] = {{.*}} c"'signed char'\00" } +// CHECK-DAG: @[[LINE_1300:.*]] = {{.*}}, i32 1300, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 3 } +// CHECK-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 3 } + +// CHECK-LABEL: @t0( +unsigned short t0(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*) +#line 100 + x++; + return x; +} +// CHECK-LABEL: @t1( +unsigned short t1(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*) +#line 200 + x--; + return x; +} +// CHECK-LABEL: @t2( +unsigned short t2(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*) +#line 300 + ++x; + return x; +} +// CHECK-LABEL: @t3( +unsigned short t3(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*) +#line 400 + --x; + return x; +} + +// CHECK-LABEL: @t4( +signed short t4(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*) +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*) +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*) +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* 
@[[LINE_800]] to i8*) +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_900]] to i8*) +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1000]] to i8*) +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1100]] to i8*) +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1200]] to i8*) +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1300]] to i8*) +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1400]] to i8*) +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1500]] to i8*) +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1600]] to i8*) +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/catch-implicit-integer-sign-changes-incdec.c b/clang/test/CodeGen/catch-implicit-integer-sign-changes-incdec.c new file mode 100644 index 0000000000000..41e08ee32a525 --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-integer-sign-changes-incdec.c @@ -0,0 +1,307 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-NOSANITIZE + +// RUN: %clang_cc1 -fsanitize=implicit-integer-sign-change -fno-sanitize-recover=implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK,CHECK-SANITIZE,CHECK-SANITIZE-ANYRECOVER,CHECK-SANITIZE-NORECOVER,CHECK-SANITIZE-UNREACHABLE +// RUN: %clang_cc1 -fsanitize=implicit-integer-sign-change -fsanitize-recover=implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK,CHECK-SANITIZE,CHECK-SANITIZE-ANYRECOVER,CHECK-SANITIZE-RECOVER +// RUN: %clang_cc1 -fsanitize=implicit-integer-sign-change -fsanitize-trap=implicit-integer-sign-change -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK,CHECK-SANITIZE,CHECK-SANITIZE-TRAP,CHECK-SANITIZE-UNREACHABLE + +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 11 }, {{.*}}* @[[INT]], {{.*}}* 
@[[UNSIGNED_SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 3 } + +unsigned short t0(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t0( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t0( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], false, !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 100 + return x++; +} +unsigned short t1(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t1( +// CHECK-NOSANITIZE-NEXT: entry: +// 
CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t1( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], false, !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 200 + return x--; +} + +unsigned short t2(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t2( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t2( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], false, !nosanitize +// CHECK-SANITIZE-NEXT: br i1 
[[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]] +#line 300 + return ++x; +} + +unsigned short t3(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t3( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t3( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], false, !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]] 
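+  // The #line directive below pins the reported source location, so the
+  // line/column encoded in the @[[LINE_400]] handler data stays stable.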
+#line 400 + return --x; +} + +signed short t4(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t4( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t4( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[DST_NEGATIVITYCHECK:%.*]] = icmp slt i16 [[X_PROMOTED_DEMOTED]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], [[DST_NEGATIVITYCHECK]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 500 + return x++; +} +signed short t5(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t5( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t5( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = 
trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[DST_NEGATIVITYCHECK:%.*]] = icmp slt i16 [[X_PROMOTED_DEMOTED]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], [[DST_NEGATIVITYCHECK]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 600 + return x--; +} + +signed short t6(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t6( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t6( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[DST_NEGATIVITYCHECK:%.*]] = icmp slt i16 [[X_PROMOTED_DEMOTED]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], [[DST_NEGATIVITYCHECK]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, 
!nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]] +#line 700 + return ++x; +} + +signed short t7(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t7( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t7( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[SRC_INC_NEGATIVITYCHECK:%.*]] = icmp slt i32 [[INC]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[DST_NEGATIVITYCHECK:%.*]] = icmp slt i16 [[X_PROMOTED_DEMOTED]], 0, !nosanitize !2 +// CHECK-SANITIZE-NEXT: [[SIGNCHANGECHECK:%.*]] = icmp eq i1 [[SRC_INC_NEGATIVITYCHECK]], [[DST_NEGATIVITYCHECK]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[SIGNCHANGECHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]] +#line 800 + return --x; +} diff --git a/clang/test/CodeGen/catch-implicit-integer-truncations-incdec-basics.c b/clang/test/CodeGen/catch-implicit-integer-truncations-incdec-basics.c new file mode 100644 index 0000000000000..6ac2be6d9fd0c --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-integer-truncations-incdec-basics.c @@ -0,0 +1,139 @@ +// RUN: %clang_cc1 -fsanitize=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation 
-fsanitize-recover=implicit-unsigned-integer-truncation,implicit-signed-integer-truncation -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// CHECK-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[UNSIGNED_CHAR:.*]] = {{.*}} c"'unsigned char'\00" } +// CHECK-DAG: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1100:.*]] = {{.*}}, i32 1100, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1200:.*]] = {{.*}}, i32 1200, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[SIGNED_CHAR:.*]] = {{.*}} c"'signed char'\00" } +// CHECK-DAG: @[[LINE_1300:.*]] = {{.*}}, i32 1300, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } + +// CHECK-LABEL: @t0( +unsigned short t0(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*) +#line 100 + x++; + return x; +} +// CHECK-LABEL: @t1( +unsigned short t1(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*) +#line 200 + x--; + return x; +} +// CHECK-LABEL: @t2( +unsigned short t2(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*) +#line 300 + ++x; + return x; +} +// CHECK-LABEL: @t3( +unsigned short t3(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*) +#line 400 + --x; + return x; +} + +// CHECK-LABEL: @t4( +signed short t4(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*) +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { + // CHECK: call void 
@__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*) +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*) +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*) +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_900]] to i8*) +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1000]] to i8*) +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1100]] to i8*) +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1200]] to i8*) +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1300]] to i8*) +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1400]] to i8*) +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1500]] to i8*) +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1600]] to i8*) +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/catch-implicit-signed-integer-truncations-incdec-basics.c b/clang/test/CodeGen/catch-implicit-signed-integer-truncations-incdec-basics.c new file mode 100644 index 0000000000000..b7e438c7229ce --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-signed-integer-truncations-incdec-basics.c @@ -0,0 +1,139 @@ +// RUN: %clang_cc1 -fsanitize=implicit-signed-integer-truncation -fsanitize-recover=implicit-signed-integer-truncation -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK + +// CHECK-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 3 }, {{.*}}* 
@[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-DAG: @[[UNSIGNED_CHAR:.*]] = {{.*}} c"'unsigned char'\00" } +// CHECK-DAG: @[[LINE_900:.*]] = {{.*}}, i32 900, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1000:.*]] = {{.*}}, i32 1000, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1100:.*]] = {{.*}}, i32 1100, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1200:.*]] = {{.*}}, i32 1200, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[SIGNED_CHAR:.*]] = {{.*}} c"'signed char'\00" } +// CHECK-DAG: @[[LINE_1300:.*]] = {{.*}}, i32 1300, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1400:.*]] = {{.*}}, i32 1400, i32 4 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1500:.*]] = {{.*}}, i32 1500, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } +// CHECK-DAG: @[[LINE_1600:.*]] = {{.*}}, i32 1600, i32 3 }, {{.*}}* @[[INT]], {{.*}}* @[[SIGNED_CHAR]], i8 2 } + +// CHECK-LABEL: @t0( +unsigned short t0(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*) +#line 100 + x++; + return x; +} +// CHECK-LABEL: @t1( +unsigned short t1(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*) +#line 200 + x--; + return x; +} +// CHECK-LABEL: @t2( +unsigned short t2(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*) +#line 300 + ++x; + return x; +} +// CHECK-LABEL: @t3( +unsigned short t3(unsigned short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*) +#line 400 + --x; + return x; +} + +// CHECK-LABEL: @t4( +signed short t4(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*) +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*) +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*) +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*) +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* 
@[[LINE_900]] to i8*) +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1000]] to i8*) +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1100]] to i8*) +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1200]] to i8*) +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1300]] to i8*) +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1400]] to i8*) +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1500]] to i8*) +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { + // CHECK: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_1600]] to i8*) +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/catch-implicit-signed-integer-truncations-incdec.c b/clang/test/CodeGen/catch-implicit-signed-integer-truncations-incdec.c new file mode 100644 index 0000000000000..1e0bad1844c50 --- /dev/null +++ b/clang/test/CodeGen/catch-implicit-signed-integer-truncations-incdec.c @@ -0,0 +1,303 @@ +// RUN: %clang_cc1 -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-NOSANITIZE + +// RUN: %clang_cc1 -fsanitize=implicit-signed-integer-truncation -fno-sanitize-recover=implicit-signed-integer-truncation -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK,CHECK-SANITIZE,CHECK-SANITIZE-ANYRECOVER,CHECK-SANITIZE-NORECOVER,CHECK-SANITIZE-UNREACHABLE +// RUN: %clang_cc1 -fsanitize=implicit-signed-integer-truncation -fsanitize-recover=implicit-signed-integer-truncation -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK,CHECK-SANITIZE,CHECK-SANITIZE-ANYRECOVER,CHECK-SANITIZE-RECOVER +// RUN: %clang_cc1 -fsanitize=implicit-signed-integer-truncation -fsanitize-trap=implicit-signed-integer-truncation -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK,CHECK-SANITIZE,CHECK-SANITIZE-TRAP,CHECK-SANITIZE-UNREACHABLE + +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[INT:.*]] = {{.*}} c"'int'\00" } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[UNSIGNED_SHORT:.*]] = {{.*}} c"'unsigned short'\00" } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_100:.*]] = {{.*}}, i32 100, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_200:.*]] = {{.*}}, i32 200, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// 
CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_300:.*]] = {{.*}}, i32 300, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_400:.*]] = {{.*}}, i32 400, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[UNSIGNED_SHORT]], i8 2 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[SHORT:.*]] = {{.*}} c"'short'\00" } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_500:.*]] = {{.*}}, i32 500, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_600:.*]] = {{.*}}, i32 600, i32 11 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_700:.*]] = {{.*}}, i32 700, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } +// CHECK-SANITIZE-ANYRECOVER-DAG: @[[LINE_800:.*]] = {{.*}}, i32 800, i32 10 }, {{.*}}* @[[INT]], {{.*}}* @[[SHORT]], i8 2 } + +unsigned short t0(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t0( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t0( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_100]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 100 + return x++; +} +unsigned short t1(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t1( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: 
[[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t1( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_200]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 200 + return x--; +} + +unsigned short t2(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t2( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t2( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: 
[[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_300]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]] +#line 300 + return ++x; +} + +unsigned short t3(unsigned short x) { +// CHECK-NOSANITIZE-LABEL: @t3( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t3( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = zext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_400]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]] +#line 400 + return --x; +} + +signed short t4(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t4( +// 
CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t4( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = sext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_500]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 500 + return x++; +} +signed short t5(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t5( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[X_RELOADED]] +// +// CHECK-SANITIZE-LABEL: @t5( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = sext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 
[[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_600]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: ret i16 [[X_RELOADED]] +#line 600 + return x--; +} + +signed short t6(signed short x) { +// CHECK-NOSANITIZE-LABEL: @t6( +// CHECK-NOSANITIZE-NEXT: entry: +// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], 1 +// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2 +// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]] +// +// CHECK-SANITIZE-LABEL: @t6( +// CHECK-SANITIZE-NEXT: entry: +// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2 +// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32 +// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], 1 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16 +// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = sext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize +// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize +// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize +// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]: +// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize +// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize +// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_700]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize +// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize +// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize +// CHECK-SANITIZE: [[CONT]]: +// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* 
[[X_ADDR]], align 2
+// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]]
+#line 700
+  return ++x;
+}
+
+signed short t7(signed short x) {
+// CHECK-NOSANITIZE-LABEL: @t7(
+// CHECK-NOSANITIZE-NEXT: entry:
+// CHECK-NOSANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2
+// CHECK-NOSANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2
+// CHECK-NOSANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2
+// CHECK-NOSANITIZE-NEXT: [[INC:%.*]] = add i16 [[X_RELOADED]], -1
+// CHECK-NOSANITIZE-NEXT: store i16 [[INC]], i16* [[X_ADDR]], align 2
+// CHECK-NOSANITIZE-NEXT: ret i16 [[INC]]
+//
+// CHECK-SANITIZE-LABEL: @t7(
+// CHECK-SANITIZE-NEXT: entry:
+// CHECK-SANITIZE-NEXT: [[X_ADDR:%.*]] = alloca i16, align 2
+// CHECK-SANITIZE-NEXT: store i16 [[X:%.*]], i16* [[X_ADDR]], align 2
+// CHECK-SANITIZE-NEXT: [[X_RELOADED:%.*]] = load i16, i16* [[X_ADDR]], align 2
+// CHECK-SANITIZE-NEXT: [[X_PROMOTED:%.*]] = sext i16 [[X_RELOADED]] to i32
+// CHECK-SANITIZE-NEXT: [[INC:%.*]] = add i32 [[X_PROMOTED]], -1
+// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED:%.*]] = trunc i32 [[INC]] to i16
+// CHECK-SANITIZE-NEXT: [[X_PROMOTED_DEMOTED_PROMOTED:%.*]] = sext i16 [[X_PROMOTED_DEMOTED]] to i32, !nosanitize
+// CHECK-SANITIZE-NEXT: [[TRUNCHECK:%.*]] = icmp eq i32 [[X_PROMOTED_DEMOTED_PROMOTED]], [[INC]], !nosanitize
+// CHECK-SANITIZE-NEXT: br i1 [[TRUNCHECK]], label %[[CONT:.*]], label %[[HANDLER_IMPLICIT_X_PROMOTEDERSION:[^,]+]],{{.*}} !nosanitize
+// CHECK-SANITIZE: [[HANDLER_IMPLICIT_X_PROMOTEDERSION]]:
+// CHECK-SANITIZE-TRAP-NEXT: call void @llvm.trap(){{.*}}, !nosanitize
+// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP1:%.*]] = zext i32 [[INC]] to i64, !nosanitize
+// CHECK-SANITIZE-ANYRECOVER-NEXT: [[TMP2:%.*]] = zext i16 [[X_PROMOTED_DEMOTED]] to i64, !nosanitize
+// CHECK-SANITIZE-NORECOVER-NEXT: call void @__ubsan_handle_implicit_conversion_abort(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize
+// CHECK-SANITIZE-RECOVER-NEXT: call void @__ubsan_handle_implicit_conversion(i8* bitcast ({ {{{.*}}}, {{{.*}}}*, {{{.*}}}*, i8 }* @[[LINE_800]] to i8*), i64 [[TMP1]], i64 [[TMP2]]) #2, !nosanitize
+// CHECK-SANITIZE-UNREACHABLE-NEXT: unreachable, !nosanitize
+// CHECK-SANITIZE-RECOVER-NEXT: br label %[[CONT]], !nosanitize
+// CHECK-SANITIZE: [[CONT]]:
+// CHECK-SANITIZE-NEXT: store i16 [[X_PROMOTED_DEMOTED]], i16* [[X_ADDR]], align 2
+// CHECK-SANITIZE-NEXT: ret i16 [[X_PROMOTED_DEMOTED]]
+#line 800
+  return --x;
+}
diff --git a/clang/test/CodeGen/catch-implicit-unsigned-integer-truncations-incdec-basics.c b/clang/test/CodeGen/catch-implicit-unsigned-integer-truncations-incdec-basics.c
new file mode 100644
index 0000000000000..7ad12314f3df0
--- /dev/null
+++ b/clang/test/CodeGen/catch-implicit-unsigned-integer-truncations-incdec-basics.c
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -fsanitize=implicit-unsigned-integer-truncation -fsanitize-recover=implicit-unsigned-integer-truncation -emit-llvm %s -o - -triple x86_64-linux-gnu | FileCheck %s -implicit-check-not="call void @__ubsan_handle_implicit_conversion" --check-prefixes=CHECK
+
+// CHECK-LABEL: @t0(
+unsigned short t0(unsigned short x) {
+#line 100
+  x++;
+  return x;
+}
+// CHECK-LABEL: @t1(
+unsigned short t1(unsigned short x) {
+#line 200
+  x--;
+  return x;
+}
+// CHECK-LABEL: @t2(
+unsigned short t2(unsigned short x) {
+#line 300
+  ++x;
+  return x;
+}
+// CHECK-LABEL: @t3(
+unsigned short t3(unsigned short x) {
+#line 400
+  --x;
+  return x;
+}
+
+// CHECK-LABEL: @t4(
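+// Note: ++/-- first promote the operand to (signed) int, so the implicit
+// truncation back to the operand's type is always a signed truncation.
+// With only -fsanitize=implicit-unsigned-integer-truncation enabled, no
+// checks are expected for any function in this file; the RUN line's
+// -implicit-check-not enforces the absence of handler calls.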
+signed short t4(signed short x) { +#line 500 + x++; + return x; +} +// CHECK-LABEL: @t5( +signed short t5(signed short x) { +#line 600 + x--; + return x; +} +// CHECK-LABEL: @t6( +signed short t6(signed short x) { +#line 700 + ++x; + return x; +} +// CHECK-LABEL: @t7( +signed short t7(signed short x) { +#line 800 + --x; + return x; +} + +// CHECK-LABEL: @t8( +unsigned char t8(unsigned char x) { +#line 900 + x++; + return x; +} +// CHECK-LABEL: @t9( +unsigned char t9(unsigned char x) { +#line 1000 + x--; + return x; +} +// CHECK-LABEL: @t10( +unsigned char t10(unsigned char x) { +#line 1100 + ++x; + return x; +} +// CHECK-LABEL: @t11( +unsigned char t11(unsigned char x) { +#line 1200 + --x; + return x; +} + +// CHECK-LABEL: @t12( +signed char t12(signed char x) { +#line 1300 + x++; + return x; +} +// CHECK-LABEL: @t13( +signed char t13(signed char x) { +#line 1400 + x--; + return x; +} +// CHECK-LABEL: @t14( +signed char t14(signed char x) { +#line 1500 + ++x; + return x; +} +// CHECK-LABEL: @t15( +signed char t15(signed char x) { +#line 1600 + --x; + return x; +} diff --git a/clang/test/CodeGen/debug-prefix-map.c b/clang/test/CodeGen/debug-prefix-map.c index d6032a658c2e2..5366e19447ae2 100644 --- a/clang/test/CodeGen/debug-prefix-map.c +++ b/clang/test/CodeGen/debug-prefix-map.c @@ -2,6 +2,8 @@ // RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH=empty %s -emit-llvm -o - | FileCheck %s -check-prefix CHECK-EVIL // RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty %s -emit-llvm -o - -main-file-name debug-prefix-map.c | FileCheck %s // RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty %s -emit-llvm -o - -fdebug-compilation-dir %p | FileCheck %s -check-prefix CHECK-COMPILATION-DIR +// RUN: %clang -g -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty -S -c %s -emit-llvm -o - | FileCheck %s +// RUN: %clang -g -ffile-prefix-map=%p=/UNLIKELY_PATH/empty -S -c %s -emit-llvm -o - | FileCheck %s #include "Inputs/stdio.h" @@ -17,21 +19,21 @@ void test_rewrite_includes() { } // CHECK-NO-MAIN-FILE-NAME: !DIFile(filename: "/UNLIKELY_PATH/empty{{/|\\\\}}" -// CHECK-NO-MAIN-FILE-NAME: !DIFile(filename: "/UNLIKELY_PATH/empty{{[/\\]}}{{.*}}", +// CHECK-NO-MAIN-FILE-NAME: !DIFile(filename: "/UNLIKELY_PATH/empty{{/|\\\\}}{{.*}}", // On POSIX systems "Dir" should actually be empty, but on Windows we // can't recognize "/UNLIKELY_PATH" as being an absolute path. 
// CHECK-NO-MAIN-FILE-NAME-SAME: directory: "{{()|(.*:.*)}}") -// CHECK-NO-MAIN-FILE-NAME: !DIFile(filename: "/UNLIKELY_PATH/empty{{[/\\]}}Inputs/stdio.h", +// CHECK-NO-MAIN-FILE-NAME: !DIFile(filename: "/UNLIKELY_PATH/empty{{/|\\\\}}Inputs/stdio.h", // CHECK-NO-MAIN-FILE-NAME-SAME: directory: "{{()|(.*:.*)}}") // CHECK-NO-MAIN-FILE-NAME-NOT: !DIFile(filename: -// CHECK-EVIL: !DIFile(filename: "/UNLIKELY_PATH=empty{{[/\\]}}{{.*}}" -// CHECK-EVIL: !DIFile(filename: "/UNLIKELY_PATH=empty{{[/\\]}}{{.*}}Inputs/stdio.h", +// CHECK-EVIL: !DIFile(filename: "/UNLIKELY_PATH=empty{{/|\\\\}}{{.*}}" +// CHECK-EVIL: !DIFile(filename: "/UNLIKELY_PATH=empty{{/|\\\\}}{{.*}}Inputs/stdio.h", // CHECK-EVIL-SAME: directory: "{{()|(.*:.*)}}") // CHECK-EVIL-NOT: !DIFile(filename: -// CHECK: !DIFile(filename: "/UNLIKELY_PATH/empty{{[/\\]}}{{.*}}" -// CHECK: !DIFile(filename: "/UNLIKELY_PATH/empty{{[/\\]}}{{.*}}Inputs/stdio.h", +// CHECK: !DIFile(filename: "/UNLIKELY_PATH/empty{{/|\\\\}}{{.*}}" +// CHECK: !DIFile(filename: "/UNLIKELY_PATH/empty{{/|\\\\}}{{.*}}Inputs/stdio.h", // CHECK-SAME: directory: "{{()|(.*:.*)}}") // CHECK-NOT: !DIFile(filename: diff --git a/clang/test/CodeGen/fpconstrained.c b/clang/test/CodeGen/fpconstrained.c new file mode 100644 index 0000000000000..0a890e2e702eb --- /dev/null +++ b/clang/test/CodeGen/fpconstrained.c @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -ftrapping-math -frounding-math -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck %s -check-prefix=FPMODELSTRICT +// RUN: %clang_cc1 -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=PRECISE +// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST +// RUN: %clang_cc1 -ffast-math -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST +// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=ignore -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST +// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck %s -check-prefix=EXCEPT +// RUN: %clang_cc1 -ffast-math -ffp-contract=fast -ffp-exception-behavior=maytrap -emit-llvm -o - %s | FileCheck %s -check-prefix=MAYTRAP +float f0, f1, f2; + +void foo() { + // CHECK-LABEL: define {{.*}}void @foo() + + // MAYTRAP: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.maytrap") + // EXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") + // FPMODELSTRICT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict") + // STRICTEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict") + // STRICTNOEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") + // PRECISE: fadd contract float %{{.*}}, %{{.*}} + // FAST: fadd fast + f0 = f1 + f2; + + // CHECK: ret +} diff --git a/clang/test/CodeGen/fpconstrained.cpp b/clang/test/CodeGen/fpconstrained.cpp new file mode 100644 index 0000000000000..7aa34c98a4879 --- /dev/null +++ b/clang/test/CodeGen/fpconstrained.cpp @@ -0,0 +1,47 @@ +// RUN: %clang_cc1 -x c++ -ftrapping-math -fexceptions -fcxx-exceptions -frounding-math -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck %s -check-prefix=FPMODELSTRICT +// RUN: %clang_cc1 -x c++ -ffp-contract=fast 
-fexceptions -fcxx-exceptions -emit-llvm -o - %s | FileCheck %s -check-prefix=PRECISE +// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST +// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST +// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=ignore -emit-llvm -o - %s | FileCheck %s -check-prefix=FAST +// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=strict -emit-llvm -o - %s | FileCheck %s -check-prefix=EXCEPT +// RUN: %clang_cc1 -x c++ -ffast-math -fexceptions -fcxx-exceptions -ffp-contract=fast -ffp-exception-behavior=maytrap -emit-llvm -o - %s | FileCheck %s -check-prefix=MAYTRAP +float f0, f1, f2; + + template <class> + class aaaa { + public: + ~aaaa(); + void b(); + }; + + template <class c> + aaaa<c>::~aaaa() { try { + b(); + // CHECK-LABEL: define {{.*}}void @_ZN4aaaaIiED2Ev{{.*}} + + } catch (...) { + // MAYTRAP: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.maytrap") + // EXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") + // FPMODELSTRICT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict") + // STRICTEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.strict") + // STRICTNOEXCEPT: llvm.experimental.constrained.fadd.f32(float %{{.*}}, float %{{.*}}, metadata !"round.dynamic", metadata !"fpexcept.ignore") + // PRECISE: fadd contract float %{{.*}}, %{{.*}} + // FAST: fadd fast + f0 = f1 + f2; + + // CHECK: ret void + } + } + + class d { + public: + d(const char *, int); + aaaa<int> e; + }; + +float foo() { + d x("", 1); + aaaa<int> a; + return f0; +} + diff --git a/clang/test/CodeGen/label-array-aggregate-init.c b/clang/test/CodeGen/label-array-aggregate-init.c new file mode 100644 index 0000000000000..5cefd8d270c08 --- /dev/null +++ b/clang/test/CodeGen/label-array-aggregate-init.c @@ -0,0 +1,10 @@ +// RUN: %clang -cc1 -triple x86_64-windows-msvc -emit-llvm %s -o - | FileCheck %s + +// CHECK: @constinit = private global [3 x i8*] [i8* blockaddress(@main, %L), i8* null, i8* null] + +void receivePtrs(void **); + +int main() { +L: + receivePtrs((void *[]){ &&L, 0, 0 }); +} diff --git a/clang/test/CodeGenCXX/constructor-destructor-return-this.cpp b/clang/test/CodeGenCXX/constructor-destructor-return-this.cpp index f6450e2d4d77d..7ef9e116df8b8 100644 --- a/clang/test/CodeGenCXX/constructor-destructor-return-this.cpp +++ b/clang/test/CodeGenCXX/constructor-destructor-return-this.cpp @@ -3,6 +3,8 @@ //RUN: %clang_cc1 %s -emit-llvm -o - -triple=thumbv7-apple-ios5.0 -target-abi apcs-gnu | FileCheck --check-prefix=CHECKIOS5 %s //RUN: %clang_cc1 %s -emit-llvm -o - -triple=wasm32-unknown-unknown \ //RUN: | FileCheck --check-prefix=CHECKARM %s +//RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-fuchsia | FileCheck --check-prefix=CHECKFUCHSIA %s +//RUN: %clang_cc1 %s -emit-llvm -o - -triple=aarch64-unknown-fuchsia | FileCheck --check-prefix=CHECKFUCHSIA %s //RUN: %clang_cc1 %s -emit-llvm -o - -triple=i386-pc-win32 -fno-rtti | FileCheck --check-prefix=CHECKMS %s // FIXME: these tests crash on the bots when run with -triple=x86_64-pc-win32 @@
-45,6 +47,11 @@ B::~B() { } // CHECKIOS5-LABEL: define %class.B* @_ZN1BD2Ev(%class.B* %this) // CHECKIOS5-LABEL: define %class.B* @_ZN1BD1Ev(%class.B* %this) +// CHECKFUCHSIA-LABEL: define %class.B* @_ZN1BC2EPi(%class.B* returned %this, i32* %i) +// CHECKFUCHSIA-LABEL: define %class.B* @_ZN1BC1EPi(%class.B* returned %this, i32* %i) +// CHECKFUCHSIA-LABEL: define %class.B* @_ZN1BD2Ev(%class.B* returned %this) +// CHECKFUCHSIA-LABEL: define %class.B* @_ZN1BD1Ev(%class.B* returned %this) + // CHECKMS-LABEL: define dso_local x86_thiscallcc %class.B* @"??0B@@QAE@PAH@Z"(%class.B* returned %this, i32* %i) // CHECKMS-LABEL: define dso_local x86_thiscallcc void @"??1B@@UAE@XZ"(%class.B* %this) @@ -83,6 +90,14 @@ C::~C() { } // CHECKIOS5-LABEL: define void @_ZN1CD0Ev(%class.C* %this) // CHECKIOS5-LABEL: define void @_ZThn8_N1CD0Ev(%class.C* %this) +// CHECKFUCHSIA-LABEL: define %class.C* @_ZN1CC2EPiPc(%class.C* returned %this, i32* %i, i8* %c) +// CHECKFUCHSIA-LABEL: define %class.C* @_ZN1CC1EPiPc(%class.C* returned %this, i32* %i, i8* %c) +// CHECKFUCHSIA-LABEL: define %class.C* @_ZN1CD2Ev(%class.C* returned %this) +// CHECKFUCHSIA-LABEL: define %class.C* @_ZN1CD1Ev(%class.C* returned %this) +// CHECKFUCHSIA-LABEL: define %class.C* @_ZThn16_N1CD1Ev(%class.C* %this) +// CHECKFUCHSIA-LABEL: define void @_ZN1CD0Ev(%class.C* %this) +// CHECKFUCHSIA-LABEL: define void @_ZThn16_N1CD0Ev(%class.C* %this) + // CHECKMS-LABEL: define dso_local x86_thiscallcc %class.C* @"??0C@@QAE@PAHPAD@Z"(%class.C* returned %this, i32* %i, i8* %c) // CHECKMS-LABEL: define dso_local x86_thiscallcc void @"??1C@@UAE@XZ"(%class.C* %this) @@ -110,6 +125,11 @@ D::~D() { } // CHECKIOS5-LABEL: define %class.D* @_ZN1DD2Ev(%class.D* %this, i8** %vtt) // CHECKIOS5-LABEL: define %class.D* @_ZN1DD1Ev(%class.D* %this) +// CHECKFUCHSIA-LABEL: define %class.D* @_ZN1DC2Ev(%class.D* returned %this, i8** %vtt) +// CHECKFUCHSIA-LABEL: define %class.D* @_ZN1DC1Ev(%class.D* returned %this) +// CHECKFUCHSIA-LABEL: define %class.D* @_ZN1DD2Ev(%class.D* returned %this, i8** %vtt) +// CHECKFUCHSIA-LABEL: define %class.D* @_ZN1DD1Ev(%class.D* returned %this) + // CHECKMS-LABEL: define dso_local x86_thiscallcc %class.D* @"??0D@@QAE@XZ"(%class.D* returned %this, i32 %is_most_derived) // CHECKMS-LABEL: define dso_local x86_thiscallcc void @"??1D@@UAE@XZ"(%class.D* %this) @@ -127,15 +147,15 @@ void test_destructor() { e2->~E(); } -// CHECKARM-LABEL: define void @_Z15test_destructorv() +// CHECKARM-LABEL,CHECKFUCHSIA-LABEL: define void @_Z15test_destructorv() // Verify that virtual calls to destructors are not marked with a 'returned' // this parameter at the call site... 
-// CHECKARM: [[VFN:%.*]] = getelementptr inbounds %class.E* (%class.E*)*, %class.E* (%class.E*)** -// CHECKARM: [[THUNK:%.*]] = load %class.E* (%class.E*)*, %class.E* (%class.E*)** [[VFN]] -// CHECKARM: call %class.E* [[THUNK]](%class.E* % +// CHECKARM,CHECKFUCHSIA: [[VFN:%.*]] = getelementptr inbounds %class.E* (%class.E*)*, %class.E* (%class.E*)** +// CHECKARM,CHECKFUCHSIA: [[THUNK:%.*]] = load %class.E* (%class.E*)*, %class.E* (%class.E*)** [[VFN]] +// CHECKARM,CHECKFUCHSIA: call %class.E* [[THUNK]](%class.E* % // ...but static calls create declarations with 'returned' this -// CHECKARM: {{%.*}} = call %class.E* @_ZN1ED1Ev(%class.E* % +// CHECKARM,CHECKFUCHSIA: {{%.*}} = call %class.E* @_ZN1ED1Ev(%class.E* % -// CHECKARM: declare %class.E* @_ZN1ED1Ev(%class.E* returned) +// CHECKARM,CHECKFUCHSIA: declare %class.E* @_ZN1ED1Ev(%class.E* returned) diff --git a/clang/test/CodeGenCXX/debug-info-template-align.cpp b/clang/test/CodeGenCXX/debug-info-template-align.cpp new file mode 100644 index 0000000000000..42fdb269a30b5 --- /dev/null +++ b/clang/test/CodeGenCXX/debug-info-template-align.cpp @@ -0,0 +1,14 @@ +// Test for debug info related to DW_AT_alignment attribute in the typedef operator +// Supported: -O0, standalone DI +// RUN: %clang_cc1 -dwarf-version=5 -emit-llvm -triple x86_64-linux-gnu %s -o - \ +// RUN: -O0 -disable-llvm-passes \ +// RUN: -debug-info-kind=standalone \ +// RUN: | FileCheck %s + +// CHECK: DIDerivedType(tag: DW_TAG_typedef, {{.*}}, align: 512 + +typedef char __attribute__((__aligned__(64))) alchar; + +int main() { + alchar newChar; +} diff --git a/clang/test/CodeGenCXX/dllexport-dtor-thunks.cpp b/clang/test/CodeGenCXX/dllexport-dtor-thunks.cpp index bda126eba855d..d2aa195e8cc3a 100644 --- a/clang/test/CodeGenCXX/dllexport-dtor-thunks.cpp +++ b/clang/test/CodeGenCXX/dllexport-dtor-thunks.cpp @@ -1,5 +1,12 @@ // RUN: %clang_cc1 -mconstructor-aliases -fms-extensions %s -emit-llvm -o - -triple x86_64-windows-msvc | FileCheck %s +namespace test1 { +struct A { ~A(); }; +struct __declspec(dllexport) B : virtual A { }; +// CHECK: define weak_odr dso_local dllexport void @"??1B@test1@@QEAA@XZ" +// CHECK: define weak_odr dso_local dllexport void @"??_DB@test1@@QEAAXXZ" +} + struct __declspec(dllexport) A { virtual ~A(); }; struct __declspec(dllexport) B { virtual ~B(); }; struct __declspec(dllexport) C : A, B { virtual ~C(); }; diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp index 6c4077a5b9a7c..045cb450b7506 100644 --- a/clang/test/CodeGenCXX/dllexport.cpp +++ b/clang/test/CodeGenCXX/dllexport.cpp @@ -860,6 +860,20 @@ struct PR40006 { }; // M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc %"struct.InClassInits::PR40006"* @"??0PR40006@InClassInits@@QAE@XZ" +namespace pr40006 { +// Delay emitting the method also past the instantiation of Tmpl<Inner>, i.e. +// until the top-level class Outer is completely finished. +template <typename> struct Tmpl {}; +struct Outer { + struct Inner { + __declspec(dllexport) Inner() = default; + unsigned int x = 0; + }; + Tmpl<Inner> y; +}; +// M32-DAG: define weak_odr dso_local dllexport x86_thiscallcc %"struct.InClassInits::pr40006::Outer::Inner"* @"??0Inner@Outer@pr40006@InClassInits@@QAE@XZ" +} + // PR42857: Clang would try to emit the non-trivial explicitly defaulted // dllexport ctor twice when doing an explicit instantiation definition.
struct Qux { Qux(); }; diff --git a/clang/test/CodeGenCXX/dllimport-dtor-thunks.cpp b/clang/test/CodeGenCXX/dllimport-dtor-thunks.cpp index da3227a49a4b5..53aa2cdbf3eef 100644 --- a/clang/test/CodeGenCXX/dllimport-dtor-thunks.cpp +++ b/clang/test/CodeGenCXX/dllimport-dtor-thunks.cpp @@ -19,9 +19,9 @@ struct __declspec(dllimport) ImportOverrideVDtor : public BaseClass { virtual ~ImportOverrideVDtor() {} }; -// Virtually inherits from a non-dllimport base class. This time we need to call -// the complete destructor and emit it inline. It's not exported from the DLL, -// and it must be emitted. +// Virtually inherits from a non-dllimport base class. In this case, we can +// expect the DLL to provide a definition of the complete dtor. See +// dllexport-dtor-thunks.cpp. struct __declspec(dllimport) ImportVBaseOverrideVDtor : public virtual BaseClass { virtual ~ImportVBaseOverrideVDtor() {} diff --git a/clang/test/CodeGenCXX/no-unique-address-2.cpp b/clang/test/CodeGenCXX/no-unique-address-2.cpp new file mode 100644 index 0000000000000..aa0c67758a192 --- /dev/null +++ b/clang/test/CodeGenCXX/no-unique-address-2.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -std=c++2a %s -emit-llvm -o - -triple x86_64-linux-gnu | FileCheck %s + +struct TriviallyCopyable {}; + +struct NonTriviallyCopyable { + NonTriviallyCopyable() = default; + NonTriviallyCopyable(const NonTriviallyCopyable&) = default; + NonTriviallyCopyable(NonTriviallyCopyable &&) {} +}; + +struct Foo { + int i; + [[no_unique_address]] TriviallyCopyable m; + [[no_unique_address]] NonTriviallyCopyable n; +}; + +void call() { + Foo foo; + Foo foo2(static_cast<Foo &&>(foo)); +} + +// The memcpy call should copy exactly 4 bytes for member 'int i' +// CHECK: define {{.*}} void @_ZN3FooC2EOS_ +// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.+}}, i8* {{.+}}, i64 4, i1 false) +// CHECK: call void @_ZN20NonTriviallyCopyableC2EOS_ diff --git a/clang/test/CodeGenObjC/nontrivial-struct-param-init.m b/clang/test/CodeGenObjC/nontrivial-struct-param-init.m new file mode 100644 index 0000000000000..96a63b83ac761 --- /dev/null +++ b/clang/test/CodeGenObjC/nontrivial-struct-param-init.m @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 -triple i386-apple-watchos6.0-simulator -emit-llvm -fblocks -fobjc-arc -o - %s | FileCheck %s + +// CHECK: %[[STRUCT_S:.*]] = type { i8* } + +typedef struct { + id x; +} S; + +// CHECK: define void @test0(i8* %[[A_0:.*]]) +// CHECK: %[[A:.*]] = alloca %[[STRUCT_S]], align 4 +// CHECK: %[[X:.*]] = getelementptr inbounds %[[STRUCT_S]], %[[STRUCT_S]]* %[[A]], i32 0, i32 0 +// CHECK: store i8* %[[A_0]], i8** %[[X]], align 4 +// CHECK: %[[V0:.*]] = bitcast %[[STRUCT_S]]* %[[A]] to i8** +// CHECK: call void @__destructor_4_s0(i8** %[[V0]]) #2 + +void test0(S a) { +} diff --git a/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl b/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl index 7e3186b186152..64f2d89c5818a 100644 --- a/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl +++ b/clang/test/CodeGenOpenCL/fdeclare-opencl-builtins.cl @@ -16,6 +16,13 @@ kernel void test_pure_attr(read_only image1d_t img) { float4 resf = read_imagef(img, 42); } +// Test that builtins with only one prototype are mangled.
+// CHECK-LABEL: @test_mangling +// CHECK: call i32 @_Z12get_local_idj +kernel void test_mangling() { + size_t lid = get_local_id(0); +} + // CHECK: attributes [[ATTR_CONST]] = // CHECK-SAME: readnone // CHECK: attributes [[ATTR_PURE]] = diff --git a/clang/test/CodeGenOpenCLCXX/addrspace-derived-base.cl b/clang/test/CodeGenOpenCLCXX/addrspace-derived-base.cl index d5d369fa80bb6..623d201c21800 100644 --- a/clang/test/CodeGenOpenCLCXX/addrspace-derived-base.cl +++ b/clang/test/CodeGenOpenCLCXX/addrspace-derived-base.cl @@ -69,3 +69,14 @@ void pr43145_3(int n) { // CHECK: bitcast i8 addrspace(4)* %add.ptr1 to %class.B2 addrspace(4)* // CHECK: call {{.*}} @_ZNU3AS42B26getRefEv } + +// Implicit conversion of derived to base. + +void functionWithBaseArgPtr(class B2 *b) {} +void functionWithBaseArgRef(class B2 &b) {} + +void pr43145_4() { + Derived d; + functionWithBaseArgPtr(&d); + functionWithBaseArgRef(d); +} diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp index 2b5def6c62a98..a57fbf7e74e52 100644 --- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp +++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp @@ -137,7 +137,7 @@ class accessor { _ImplT<dimensions> impl; private: - void __init(__attribute__((ocl_global)) dataT *Ptr, range<dimensions> AccessRange, + void __init(__attribute__((opencl_global)) dataT *Ptr, range<dimensions> AccessRange, range<dimensions> MemRange, id<dimensions> Offset) {} }; diff --git a/clang/test/CodeGenSYCL/address-space-parameter-conversions.cpp b/clang/test/CodeGenSYCL/address-space-parameter-conversions.cpp index 1d5beced187bd..4697c57363bb4 100644 --- a/clang/test/CodeGenSYCL/address-space-parameter-conversions.cpp +++ b/clang/test/CodeGenSYCL/address-space-parameter-conversions.cpp @@ -3,7 +3,7 @@ void bar(int & Data) {} // CHECK-DAG: define spir_func void @[[RAW_REF:[a-zA-Z0-9_]+]](i32 addrspace(4)* dereferenceable(4) % void bar2(int & Data) {} // CHECK-DAG: define spir_func void @[[RAW_REF2:[a-zA-Z0-9_]+]](i32 addrspace(4)* dereferenceable(4) % -void bar(__attribute__((ocl_local)) int &Data) {} +void bar(__attribute__((opencl_local)) int &Data) {} // CHECK-DAG: define spir_func void [[LOC_REF:@[a-zA-Z0-9_]+]](i32 addrspace(3)* dereferenceable(4) % void foo(int * Data) {} // CHECK-DAG: define spir_func void @[[RAW_PTR:[a-zA-Z0-9_]+]](i32 addrspace(4)* % @@ -20,12 +20,12 @@ void usages() { // CHECK-DAG: [[GLOB:%[a-zA-Z0-9]+]] = alloca i32 addrspace(1)* __attribute__((address_space(1))) int *GLOB; // CHECK-DAG: [[LOC:%[a-zA-Z0-9]+]] = alloca i32 addrspace(3)* - __attribute__((ocl_local)) int *LOC; + __attribute__((opencl_local)) int *LOC; // CHECK-DAG: [[NoAS:%[a-zA-Z0-9]+]] = alloca i32 addrspace(4)* int *NoAS; // CHECK-DAG: [[PRIV:%[a-zA-Z0-9]+]] = alloca i32* - __attribute__((ocl_private)) int *PRIV; + __attribute__((opencl_private)) int *PRIV; bar(*GLOB); // CHECK-DAG: [[GLOB_LOAD:%[a-zA-Z0-9]+]] = load i32 addrspace(1)*, i32 addrspace(1)** [[GLOB]] @@ -98,19 +98,19 @@ void usages2() { // CHECK-DAG: [[PRIV_NUM:%[a-zA-Z0-9_]+]] = alloca i32* __attribute__((address_space(0))) int *PRIV_NUM2; // CHECK-DAG: [[PRIV_NUM2:%[a-zA-Z0-9_]+]] = alloca i32* - __attribute__((ocl_private)) int *PRIV; + __attribute__((opencl_private)) int *PRIV; // CHECK-DAG: [[PRIV:%[a-zA-Z0-9_]+]] = alloca i32* __attribute__((address_space(1))) int *GLOB_NUM; // CHECK-DAG: [[GLOB_NUM:%[a-zA-Z0-9_]+]] = alloca i32 addrspace(1)* - __attribute__((ocl_global)) int *GLOB; + __attribute__((opencl_global)) int *GLOB; // CHECK-DAG: [[GLOB:%[a-zA-Z0-9_]+]] = alloca i32 addrspace(1)* __attribute__((address_space(2))) int
*CONST_NUM; // CHECK-DAG: [[CONST_NUM:%[a-zA-Z0-9_]+]] = alloca i32 addrspace(2)* - __attribute__((ocl_constant)) int *CONST; + __attribute__((opencl_constant)) int *CONST; // CHECK-DAG: [[CONST:%[a-zA-Z0-9_]+]] = alloca i32 addrspace(2)* __attribute__((address_space(3))) int *LOCAL_NUM; // CHECK-DAG: [[LOCAL_NUM:%[a-zA-Z0-9_]+]] = alloca i32 addrspace(3)* - __attribute__((ocl_local)) int *LOCAL; + __attribute__((opencl_local)) int *LOCAL; // CHECK-DAG: [[LOCAL:%[a-zA-Z0-9_]+]] = alloca i32 addrspace(3)* bar(*PRIV_NUM); diff --git a/clang/test/CoverageMapping/switch.cpp b/clang/test/CoverageMapping/switch.cpp index 30c64922201f4..25ea4053f4e2c 100644 --- a/clang/test/CoverageMapping/switch.cpp +++ b/clang/test/CoverageMapping/switch.cpp @@ -2,11 +2,11 @@ // CHECK: foo void foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+8]]:2 = #0 - switch(i) { + switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+4]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:11 = #2 return; case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #3 - break; // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE+2]]:3 = #1 + break; // CHECK-NEXT: Gap,File 0, [[@LINE]]:10 -> [[@LINE+2]]:3 = #1 } int x = 0; // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:2 = #1 } @@ -29,7 +29,7 @@ void bar(int i) { // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+20]]:2 = #0 nop(); switch (i) { // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+6]]:2 = #4 - nop(); // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE+2]]:10 = 0 + nop(); // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:14 -> [[@LINE+2]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #7 nop(); } @@ -47,7 +47,7 @@ void baz() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+5]]:2 = #0 // CHECK-NEXT: main int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+35]]:2 = #0 int i = 0; - switch(i) { + switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+8]]:10 = 0 case 0: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = #2 i = 1; break; @@ -58,16 +58,16 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+35]]:2 = #0 break; // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE+2]]:3 = #1 } switch(i) { // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+23]]:2 = #1 - case 0: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = #6 - i = 1; + case 0: // CHECK-NEXT: Gap,File 0, [[@LINE-1]]:13 -> [[@LINE+6]]:10 = 0 + i = 1; // CHECK-NEXT: File 0, [[@LINE-1]]:3 -> [[@LINE+1]]:10 = #6 break; case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = #7 i = 2; default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = (#7 + #8) break; // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE+3]]:3 = #5 } - - switch(i) { // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+13]]:2 = #5 + // CHECK-NEXT: File 0, [[@LINE+1]]:3 -> [[@LINE+14]]:2 = #5 + switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+6]]:11 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+5]]:11 = #10 case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+4]]:11 = (#10 + #11) i = 11; @@ -82,10 +82,23 @@ int main() { // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+35]]:2 = #0 return 0; } + // CHECK: pr44011 +int pr44011(int i) { // CHECK-NEXT: File 0, [[@LINE]]:20 -> {{.*}}:2 = #0 + switch (i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> [[@LINE+6]]:13 = 0 + + case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 = #2 + return 0; + + default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 = #3 + return 1; + } +} // A region for counter #1 is 
missing due to the missing return. + + // FIXME: End location for "case 1" shouldn't point at the end of the switch. // CHECK: fallthrough int fallthrough(int i) { // CHECK-NEXT: File 0, [[@LINE]]:24 -> [[@LINE+12]]:2 = #0 - switch(i) { + switch(i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:13 -> [[@LINE+9]]:10 = 0 case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+8]]:10 = #2 i = 23; case 2: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = (#2 + #3) @@ -101,7 +114,7 @@ int fallthrough(int i) { // CHECK-NEXT: File 0, [[@LINE]]:24 -> [[@LINE+12]]:2 = void abort(void) __attribute((noreturn)); // CHECK: noret int noret(int x) { // CHECK-NEXT: File 0, [[@LINE]]:18 -> [[@LINE+9]]:2 - switch (x) { + switch (x) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> [[@LINE+6]]:14 = 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:12 abort(); case 1: // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 diff --git a/clang/test/CoverageMapping/switchmacro.c b/clang/test/CoverageMapping/switchmacro.c index f4c14f798f0be..fc0392fb91e53 100644 --- a/clang/test/CoverageMapping/switchmacro.c +++ b/clang/test/CoverageMapping/switchmacro.c @@ -4,7 +4,7 @@ // CHECK: foo int foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:16 -> {{[0-9]+}}:2 = #0 - switch (i) { + switch (i) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:14 -> {{[0-9]+}}:11 = 0 default: // CHECK-NEXT: File 0, [[@LINE]]:3 -> {{[0-9]+}}:11 = #2 if (i == 1) // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = #2 return 0; // CHECK: File 0, [[@LINE]]:7 -> [[@LINE]]:15 = #3 diff --git a/clang/test/Driver/Inputs/aix_ppc_tree/dummy0.s b/clang/test/Driver/Inputs/aix_ppc_tree/dummy0.s new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/aix_ppc_tree/dummy1.s b/clang/test/Driver/Inputs/aix_ppc_tree/dummy1.s new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/Inputs/aix_ppc_tree/dummy2.s b/clang/test/Driver/Inputs/aix_ppc_tree/dummy2.s new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/clang/test/Driver/aix-as.c b/clang/test/Driver/aix-as.c new file mode 100644 index 0000000000000..4f67d1ba90b70 --- /dev/null +++ b/clang/test/Driver/aix-as.c @@ -0,0 +1,73 @@ +// General tests that as(1) invocations on AIX targets are sane. Note that we +// only test assembler functionalities in this suite. + +// Check powerpc-ibm-aix7.1.0.0, 32-bit. +// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \ +// RUN: -target powerpc-ibm-aix7.1.0.0 \ +// RUN: | FileCheck --check-prefix=CHECK-AS32 %s +// CHECK-AS32-NOT: warning: +// CHECK-AS32: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0" +// CHECK-AS32: "{{.*}}as{{(.exe)?}}" +// CHECK-AS32: "-a32" +// CHECK-AS32: "-u" +// CHECK-AS32: "-many" + +// Check powerpc64-ibm-aix7.1.0.0, 64-bit. +// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \ +// RUN: -target powerpc64-ibm-aix7.1.0.0 \ +// RUN: | FileCheck --check-prefix=CHECK-AS64 %s +// CHECK-AS64-NOT: warning: +// CHECK-AS64: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0" +// CHECK-AS64: "{{.*}}as{{(.exe)?}}" +// CHECK-AS64: "-a64" +// CHECK-AS64: "-u" +// CHECK-AS64: "-many" + + +// Check powerpc-ibm-aix7.1.0.0, 32-bit. -Xassembler option. 
+// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \ +// RUN: -Xassembler -w \ +// RUN: -target powerpc-ibm-aix7.1.0.0 \ +// RUN: | FileCheck --check-prefix=CHECK-AS32-Xassembler %s +// CHECK-AS32-Xassembler-NOT: warning: +// CHECK-AS32-Xassembler: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc-ibm-aix7.1.0.0" +// CHECK-AS32-Xassembler: "{{.*}}as{{(.exe)?}}" +// CHECK-AS32-Xassembler: "-a32" +// CHECK-AS32-Xassembler: "-u" +// CHECK-AS32-Xassembler: "-many" +// CHECK-AS32-Xassembler: "-w" + +// Check powerpc64-ibm-aix7.1.0.0, 64-bit. -Wa,<arg>,<arg> option. +// RUN: %clang -no-canonical-prefixes %s -### -c -o %t.o 2>&1 \ +// RUN: -Wa,-v,-w \ +// RUN: -target powerpc64-ibm-aix7.1.0.0 \ +// RUN: | FileCheck --check-prefix=CHECK-AS64-Wa %s +// CHECK-AS64-Wa-NOT: warning: +// CHECK-AS64-Wa: {{.*}}clang{{(.exe)?}}" "-cc1" "-triple" "powerpc64-ibm-aix7.1.0.0" +// CHECK-AS64-Wa: "{{.*}}as{{(.exe)?}}" +// CHECK-AS64-Wa: "-a64" +// CHECK-AS64-Wa: "-u" +// CHECK-AS64-Wa: "-many" +// CHECK-AS64-Wa: "-v" +// CHECK-AS64-Wa: "-w" + +// Check powerpc-ibm-aix7.1.0.0, 32-bit. Multiple input files. +// RUN: %clang -no-canonical-prefixes -### -c \ +// RUN: %S/Inputs/aix_ppc_tree/dummy0.s \ +// RUN: %S/Inputs/aix_ppc_tree/dummy1.s \ +// RUN: %S/Inputs/aix_ppc_tree/dummy2.s 2>&1 \ +// RUN: -target powerpc-ibm-aix7.1.0.0 \ +// RUN: | FileCheck --check-prefix=CHECK-AS32-MultiInput %s +// CHECK-AS32-MultiInput-NOT: warning: +// CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" +// CHECK-AS32-MultiInput: "-a32" +// CHECK-AS32-MultiInput: "-u" +// CHECK-AS32-MultiInput: "-many" +// CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" +// CHECK-AS32-MultiInput: "-a32" +// CHECK-AS32-MultiInput: "-u" +// CHECK-AS32-MultiInput: "-many" +// CHECK-AS32-MultiInput: "{{.*}}as{{(.exe)?}}" +// CHECK-AS32-MultiInput: "-a32" +// CHECK-AS32-MultiInput: "-u" +// CHECK-AS32-MultiInput: "-many" diff --git a/clang/test/Driver/arm-reserved-reg-options.c b/clang/test/Driver/arm-reserved-reg-options.c deleted file mode 100644 index e97c717d7e7e7..0000000000000 --- a/clang/test/Driver/arm-reserved-reg-options.c +++ /dev/null @@ -1,35 +0,0 @@ -// ## FP ARM + Thumb -// RUN: %clang -target arm-arm-none-eabi -### -ffixed-r11 -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R11 %s -// RUN: %clang -target arm-arm-none-eabi -### -ffixed-r7 -c %s 2>&1 | FileCheck -check-prefix=CHECK-NO-ERROR %s - -// RUN: %clang -target arm-arm-none-eabi -### -ffixed-r7 -mthumb -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R7 %s -// RUN: %clang -target arm-arm-none-eabi -### -ffixed-r11 -mthumb -c %s 2>&1 | FileCheck -check-prefix=CHECK-NO-ERROR %s - -// RUN: %clang -target thumbv6m-none-eabi -### -ffixed-r7 -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R7 %s -// RUN: %clang -target thumbv6m-none-eabi -### -ffixed-r11 -c %s 2>&1 | FileCheck -check-prefix=CHECK-NO-ERROR %s - -// ## FP Darwin (R7) -// RUN: %clang -target armv6-apple-darwin9 -### -ffixed-r7 -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R7 %s -// RUN: %clang -target armv6-apple-darwin9 -### -ffixed-r11 -c %s 2>&1 | FileCheck -check-prefix=CHECK-NO-ERROR %s - -// RUN: %clang -target armv6-apple-ios3 -### -ffixed-r7 -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R7 %s -// RUN: %clang -target armv6-apple-ios3 -### -ffixed-r11 -c %s 2>&1 | FileCheck -check-prefix=CHECK-NO-ERROR %s - -// RUN: %clang -target armv7s-apple-darwin10 -### -ffixed-r7 -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R7 %s -// RUN: %clang -target armv7s-apple-darwin10 -### -ffixed-r11 -c %s 2>&1 | FileCheck
-check-prefix=CHECK-NO-ERROR %s - -// ## FP Windows (R11) -// RUN: %clang -target armv7-windows -### -ffixed-r11 -c %s 2>&1 | FileCheck -check-prefix=CHECK-ERROR-R11 %s -// RUN: %clang -target armv7-windows -### -ffixed-r7 -c %s 2>&1 | FileCheck -check-prefix=CHECK-NO-ERROR %s - -// ## FRWPI (R9) -// RUN: %clang -target arm-arm-none-eabi -### -frwpi -ffixed-r9 -c %s 2>&1 | FileCheck -check-prefix=CHECK-RESERVED-FRWPI-CONFLICT %s -// RUN: %clang -target arm-arm-none-eabi -### -ffixed-r9 -c %s 2>&1 | FileCheck -check-prefix=CHECK-RESERVED-FRWPI-VALID %s -// RUN: %clang -target arm-arm-none-eabi -### -frwpi -c %s 2>&1 | FileCheck -check-prefix=CHECK-RESERVED-FRWPI-VALID %s - -// CHECK-ERROR-R11: error: '-ffixed-r11' has been specified but 'r11' is used as the frame pointer for this target -// CHECK-ERROR-R7: error: '-ffixed-r7' has been specified but 'r7' is used as the frame pointer for this target -// CHECK-NO-ERROR-NOT: may still be used as a frame pointer - -// CHECK-RESERVED-FRWPI-CONFLICT: option '-ffixed-r9' cannot be specified with '-frwpi' -// CHECK-RESERVED-FRWPI-VALID-NOT: option '-ffixed-r9' cannot be specified with '-frwpi' diff --git a/clang/test/Driver/check-time-trace.cpp b/clang/test/Driver/check-time-trace.cpp index 3c6a002ae8ab9..bff2c1984daa9 100644 --- a/clang/test/Driver/check-time-trace.cpp +++ b/clang/test/Driver/check-time-trace.cpp @@ -12,7 +12,7 @@ // CHECK-NEXT: "pid": // CHECK-NEXT: "tid": // CHECK-NEXT: "ts": -// CHECK: "name": "clang" +// CHECK: "name": "clang{{.*}}" // CHECK: "name": "process_name" template <typename T> diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index 17feaab26ab79..fef9cbfb115e0 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -198,6 +198,22 @@ // CHECK-EXTENDED-IDENTIFIERS-NOT: "-fextended-identifiers" // CHECK-NO-EXTENDED-IDENTIFIERS: error: unsupported option '-fno-extended-identifiers' +// RUN: %clang -### -S -frounding-math %s 2>&1 | FileCheck -check-prefix=CHECK-ROUNDING-MATH %s +// CHECK-ROUNDING-MATH: "-cc1" +// CHECK-ROUNDING-MATH: "-frounding-math" +// CHECK-ROUNDING-MATH-NOT: "-fno-rounding-math" +// RUN: %clang -### -S %s 2>&1 | FileCheck -check-prefix=CHECK-ROUNDING-MATH-NOT %s +// RUN: %clang -### -S -ffp-model=imprecise %s 2>&1 | FileCheck -check-prefix=CHECK-FPMODEL %s +// CHECK-FPMODEL: unsupported argument 'imprecise' to option 'ffp-model=' +// RUN: %clang -### -S -ffp-model=precise %s 2>&1 | FileCheck -check-prefix=IGNORE %s +// RUN: %clang -### -S -ffp-model=strict %s 2>&1 | FileCheck -check-prefix=IGNORE %s +// RUN: %clang -### -S -ffp-model=fast %s 2>&1 | FileCheck -check-prefix=IGNORE %s +// RUN: %clang -### -S -ffp-exception-behavior=trap %s 2>&1 | FileCheck -check-prefix=CHECK-FPEB %s +// CHECK-FPEB: unsupported argument 'trap' to option 'ffp-exception-behavior=' +// RUN: %clang -### -S -ffp-exception-behavior=maytrap %s 2>&1 | FileCheck -check-prefix=IGNORE %s +// RUN: %clang -### -S -ffp-exception-behavior=ignore %s 2>&1 | FileCheck -check-prefix=IGNORE %s +// RUN: %clang -### -S -ffp-exception-behavior=strict %s 2>&1 | FileCheck -check-prefix=IGNORE %s + // RUN: %clang -### -S -fno-pascal-strings -mpascal-strings %s 2>&1 | FileCheck -check-prefix=CHECK-M-PASCAL-STRINGS %s // CHECK-M-PASCAL-STRINGS: "-fpascal-strings" @@ -320,7 +336,6 @@ // RUN: -fprefetch-loop-arrays \ // RUN: -fprofile-correction \ // RUN: -fprofile-values \ -// RUN: -frounding-math \ // RUN: -fschedule-insns \ // RUN: -fsignaling-nans \ // RUN: -fstrength-reduce \ @@ -385,7 +400,6 @@
// CHECK-WARNING-DAG: optimization flag '-fprefetch-loop-arrays' is not supported // CHECK-WARNING-DAG: optimization flag '-fprofile-correction' is not supported // CHECK-WARNING-DAG: optimization flag '-fprofile-values' is not supported -// CHECK-WARNING-DAG: optimization flag '-frounding-math' is not supported // CHECK-WARNING-DAG: optimization flag '-fschedule-insns' is not supported // CHECK-WARNING-DAG: optimization flag '-fsignaling-nans' is not supported // CHECK-WARNING-DAG: optimization flag '-fstrength-reduce' is not supported diff --git a/clang/test/Driver/darwin-opt-record.c b/clang/test/Driver/darwin-opt-record.c index ca0fad7ee16d3..7c674819663a5 100644 --- a/clang/test/Driver/darwin-opt-record.c +++ b/clang/test/Driver/darwin-opt-record.c @@ -1,6 +1,6 @@ // REQUIRES: system-darwin -// RUN: %clang -### -S -o FOO -fsave-optimization-record -arch x86_64 -arch x86_64h %s 2>&1 | FileCheck %s --check-prefix=CHECK-MULTIPLE-ARCH +// RUN: %clang -target x86_64-apple-darwin10 -### -c -o FOO -fsave-optimization-record -arch x86_64 -arch x86_64h %s 2>&1 | FileCheck %s --check-prefix=CHECK-MULTIPLE-ARCH // // CHECK-MULTIPLE-ARCH: "-cc1" // CHECK-MULTIPLE-ARCH: "-opt-record-file" "FOO-x86_64.opt.yaml" diff --git a/clang/test/Driver/debug-prefix-map.S b/clang/test/Driver/debug-prefix-map.S index 2ba66be0edfce..7d12a17479726 100644 --- a/clang/test/Driver/debug-prefix-map.S +++ b/clang/test/Driver/debug-prefix-map.S @@ -1,4 +1,5 @@ // RUN: %clang -### -g -fdebug-prefix-map=old=new %s 2>&1 | FileCheck %s +// RUN: %clang -### -g -ffile-prefix-map=old=new %s 2>&1 | FileCheck %s // CHECK: cc1as // CHECK-SAME: -fdebug-prefix-map=old=new diff --git a/clang/test/Driver/debug-prefix-map.c b/clang/test/Driver/debug-prefix-map.c index b4f3859f982ab..f2c87cb7c11c9 100644 --- a/clang/test/Driver/debug-prefix-map.c +++ b/clang/test/Driver/debug-prefix-map.c @@ -1,9 +1,28 @@ -// RUN: %clang -### -fdebug-prefix-map=old %s 2>&1 | FileCheck %s -check-prefix CHECK-INVALID -// RUN: %clang -### -fdebug-prefix-map=old=new %s 2>&1 | FileCheck %s -check-prefix CHECK-SIMPLE -// RUN: %clang -### -fdebug-prefix-map=old=n=ew %s 2>&1 | FileCheck %s -check-prefix CHECK-COMPLEX -// RUN: %clang -### -fdebug-prefix-map=old= %s 2>&1 | FileCheck %s -check-prefix CHECK-EMPTY - -// CHECK-INVALID: error: invalid argument 'old' to -fdebug-prefix-map -// CHECK-SIMPLE: fdebug-prefix-map=old=new -// CHECK-COMPLEX: fdebug-prefix-map=old=n=ew -// CHECK-EMPTY: fdebug-prefix-map=old= +// RUN: %clang -### -fdebug-prefix-map=old %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-INVALID +// RUN: %clang -### -fmacro-prefix-map=old %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-INVALID +// RUN: %clang -### -ffile-prefix-map=old %s 2>&1 | FileCheck %s -check-prefix CHECK-FILE-INVALID + +// RUN: %clang -### -fdebug-prefix-map=old=new %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-SIMPLE +// RUN: %clang -### -fmacro-prefix-map=old=new %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-SIMPLE +// RUN: %clang -### -ffile-prefix-map=old=new %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-SIMPLE +// RUN: %clang -### -ffile-prefix-map=old=new %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-SIMPLE + +// RUN: %clang -### -fdebug-prefix-map=old=n=ew %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-COMPLEX +// RUN: %clang -### -fmacro-prefix-map=old=n=ew %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-COMPLEX +// RUN: %clang -### -ffile-prefix-map=old=n=ew %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-COMPLEX +// RUN: %clang -### 
-ffile-prefix-map=old=n=ew %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-COMPLEX + +// RUN: %clang -### -fdebug-prefix-map=old= %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-EMPTY +// RUN: %clang -### -fmacro-prefix-map=old= %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-EMPTY +// RUN: %clang -### -ffile-prefix-map=old= %s 2>&1 | FileCheck %s -check-prefix CHECK-DEBUG-EMPTY +// RUN: %clang -### -ffile-prefix-map=old= %s 2>&1 | FileCheck %s -check-prefix CHECK-MACRO-EMPTY + +// CHECK-DEBUG-INVALID: error: invalid argument 'old' to -fdebug-prefix-map +// CHECK-MACRO-INVALID: error: invalid argument 'old' to -fmacro-prefix-map +// CHECK-FILE-INVALID: error: invalid argument 'old' to -ffile-prefix-map +// CHECK-DEBUG-SIMPLE: fdebug-prefix-map=old=new +// CHECK-MACRO-SIMPLE: fmacro-prefix-map=old=new +// CHECK-DEBUG-COMPLEX: fdebug-prefix-map=old=n=ew +// CHECK-MACRO-COMPLEX: fmacro-prefix-map=old=n=ew +// CHECK-DEBUG-EMPTY: fdebug-prefix-map=old= +// CHECK-MACRO-EMPTY: fmacro-prefix-map=old= diff --git a/clang/test/Driver/fast-math.c b/clang/test/Driver/fast-math.c index 916384216d8c5..da47de260dc90 100644 --- a/clang/test/Driver/fast-math.c +++ b/clang/test/Driver/fast-math.c @@ -170,11 +170,11 @@ // RUN: %clang -### -fno-fast-math -ffast-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FAST-MATH %s // RUN: %clang -### -funsafe-math-optimizations -ffinite-math-only \ -// RUN: -fno-math-errno -ffp-contract=fast -c %s 2>&1 \ +// RUN: -fno-math-errno -ffp-contract=fast -fno-rounding-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FAST-MATH %s // RUN: %clang -### -fno-honor-infinities -fno-honor-nans -fno-math-errno \ // RUN: -fassociative-math -freciprocal-math -fno-signed-zeros \ -// RUN: -fno-trapping-math -ffp-contract=fast -c %s 2>&1 \ +// RUN: -fno-trapping-math -ffp-contract=fast -fno-rounding-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=CHECK-FAST-MATH %s // CHECK-FAST-MATH: "-cc1" // CHECK-FAST-MATH: "-ffast-math" diff --git a/clang/test/Driver/fp-model.c b/clang/test/Driver/fp-model.c new file mode 100644 index 0000000000000..a3984acef62b2 --- /dev/null +++ b/clang/test/Driver/fp-model.c @@ -0,0 +1,137 @@ +// Test that incompatible combinations of -ffp-model= options +// and other floating point options get a warning diagnostic. 
+// +// REQUIRES: clang-driver + +// RUN: %clang -### -ffp-model=fast -ffp-contract=off -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN %s +// WARN: warning: overriding '-ffp-model=fast' option with '-ffp-contract=off' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=fast -ffp-contract=on -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN1 %s +// WARN1: warning: overriding '-ffp-model=fast' option with '-ffp-contract=on' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -fassociative-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN2 %s +// WARN2: warning: overriding '-ffp-model=strict' option with '-fassociative-math' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -ffast-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN3 %s +// WARN3: warning: overriding '-ffp-model=strict' option with '-ffast-math' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -ffinite-math-only -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN4 %s +// WARN4: warning: overriding '-ffp-model=strict' option with '-ffinite-math-only' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -ffp-contract=fast -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN5 %s +// WARN5: warning: overriding '-ffp-model=strict' option with '-ffp-contract=fast' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -ffp-contract=off -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN6 %s +// WARN6: warning: overriding '-ffp-model=strict' option with '-ffp-contract=off' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -ffp-contract=on -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN7 %s +// WARN7: warning: overriding '-ffp-model=strict' option with '-ffp-contract=on' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -fno-honor-infinities -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN8 %s +// WARN8: warning: overriding '-ffp-model=strict' option with '-fno-honor-infinities' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -fno-honor-nans -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARN9 %s +// WARN9: warning: overriding '-ffp-model=strict' option with '-fno-honor-nans' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -fno-rounding-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARNa %s +// WARNa: warning: overriding '-ffp-model=strict' option with '-fno-rounding-math' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -fno-signed-zeros -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARNb %s +// WARNb: warning: overriding '-ffp-model=strict' option with '-fno-signed-zeros' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -fno-trapping-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARNc %s +// WARNc: warning: overriding '-ffp-model=strict' option with '-fno-trapping-math' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -freciprocal-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARNd %s +// WARNd: warning: overriding '-ffp-model=strict' option with '-freciprocal-math' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -funsafe-math-optimizations -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARNe %s +// WARNe: warning: overriding '-ffp-model=strict' option with '-funsafe-math-optimizations' [-Woverriding-t-option] + +// RUN: %clang -### -ffp-model=strict -Ofast -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=WARNf %s +// WARNf: warning: overriding 
'-ffp-model=strict' option with '-Ofast' [-Woverriding-t-option] + +// RUN: %clang -### -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-NOROUND %s +// CHECK-NOROUND: "-cc1" +// CHECK-NOROUND: "-fno-rounding-math" + +// RUN: %clang -### -frounding-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-ROUND --implicit-check-not ffp-exception-behavior=strict %s +// CHECK-ROUND: "-cc1" +// CHECK-ROUND: "-frounding-math" + +// RUN: %clang -### -ftrapping-math -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-TRAP %s +// CHECK-TRAP: "-cc1" +// CHECK-TRAP: "-ftrapping-math" +// CHECK-TRAP: "-ffp-exception-behavior=strict" + +// RUN: %clang -### -nostdinc -ffp-model=fast -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FPM-FAST %s +// CHECK-FPM-FAST: "-cc1" +// CHECK-FPM-FAST: "-menable-no-infs" +// CHECK-FPM-FAST: "-menable-no-nans" +// CHECK-FPM-FAST: "-menable-unsafe-fp-math" +// CHECK-FPM-FAST: "-fno-signed-zeros" +// CHECK-FPM-FAST: "-mreassociate" +// CHECK-FPM-FAST: "-freciprocal-math" +// CHECK-FPM-FAST: "-ffp-contract=fast" +// CHECK-FPM-FAST: "-fno-rounding-math" +// CHECK-FPM-FAST: "-ffast-math" +// CHECK-FPM-FAST: "-ffinite-math-only" + +// RUN: %clang -### -nostdinc -ffp-model=precise -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FPM-PRECISE %s +// CHECK-FPM-PRECISE: "-cc1" +// CHECK-FPM-PRECISE: "-ffp-contract=fast" +// CHECK-FPM-PRECISE: "-fno-rounding-math" + +// RUN: %clang -### -nostdinc -ffp-model=strict -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FPM-STRICT %s +// CHECK-FPM-STRICT: "-cc1" +// CHECK-FPM-STRICT: "-ftrapping-math" +// CHECK-FPM-STRICT: "-frounding-math" +// CHECK-FPM-STRICT: "-ffp-exception-behavior=strict" + +// RUN: %clang -### -nostdinc -ftrapping-math -ffp-exception-behavior=ignore -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-TRAP-IGNORE %s +// CHECK-TRAP-IGNORE: "-cc1" +// CHECK-TRAP-IGNORE: "-fno-rounding-math" +// CHECK-TRAP-IGNORE: "-ffp-exception-behavior=ignore" + + +// RUN: %clang -### -nostdinc -ffp-exception-behavior=strict -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FEB-STRICT %s +// CHECK-FEB-STRICT: "-cc1" +// CHECK-FEB-STRICT: "-fno-rounding-math" +// CHECK-FEB-STRICT: "-ffp-exception-behavior=strict" + +// RUN: %clang -### -nostdinc -ffp-exception-behavior=maytrap -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FEB-MAYTRAP %s +// CHECK-FEB-MAYTRAP: "-cc1" +// CHECK-FEB-MAYTRAP: "-fno-rounding-math" +// CHECK-FEB-MAYTRAP: "-ffp-exception-behavior=maytrap" + +// RUN: %clang -### -nostdinc -ffp-exception-behavior=ignore -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-FEB-IGNORE %s +// CHECK-FEB-IGNORE: "-cc1" +// CHECK-FEB-IGNORE: "-fno-rounding-math" +// CHECK-FEB-IGNORE: "-ffp-exception-behavior=ignore" + diff --git a/clang/test/Driver/fuse-ld.c b/clang/test/Driver/fuse-ld.c index 4b2ec7b1bb2ae..13e709ccfdfa4 100644 --- a/clang/test/Driver/fuse-ld.c +++ b/clang/test/Driver/fuse-ld.c @@ -79,13 +79,13 @@ // RUN: %clang %s -### -fuse-ld=lld \ // RUN: -target i686-unknown-windows-msvc 2>&1 \ // RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LLD -// CHECK-WINDOWS-MSVC-LLD: "{{.*}}lld-link" +// CHECK-WINDOWS-MSVC-LLD: "{{.*}}lld-link{{\.exe"|"}} // CHECK-WINDOWS-MSVC-LLD-SAME: "-out:{{.*}}" // RUN: %clang %s -### -fuse-ld=lld-link \ // RUN: -target i686-unknown-windows-msvc 2>&1 \ // RUN: | FileCheck %s --check-prefix CHECK-WINDOWS-MSVC-LLD-LINK -// CHECK-WINDOWS-MSVC-LLD-LINK: "{{.*}}lld-link" +// CHECK-WINDOWS-MSVC-LLD-LINK: "{{.*}}lld-link{{\.exe"|"}} // 
CHECK-WINDOWS-MSVC-LLD-LINK-SAME: "-out:{{.*}}" // RUN: %clang %s -### -fuse-ld=bfd \ diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip index 59c1927330c03..14401a947e6f6 100644 --- a/clang/test/Driver/hip-device-libs.hip +++ b/clang/test/Driver/hip-device-libs.hip @@ -22,7 +22,6 @@ // COM: {{"[^"]*clang[^"]*"}} // COM-SAME: "-mlink-builtin-bitcode" "{{.*}}hip.amdgcn.bc" -// COM-SAME: "-mlink-builtin-bitcode" "{{.*}}opencl.amdgcn.bc" // COM-SAME: "-mlink-builtin-bitcode" "{{.*}}ocml.amdgcn.bc" // COM-SAME: "-mlink-builtin-bitcode" "{{.*}}ockl.amdgcn.bc" // FLUSHD-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_on.amdgcn.bc" diff --git a/clang/test/Index/index-module-with-vfs.m b/clang/test/Index/index-module-with-vfs.m index 46fa68dfa1308..06944d372d49b 100644 --- a/clang/test/Index/index-module-with-vfs.m +++ b/clang/test/Index/index-module-with-vfs.m @@ -6,7 +6,7 @@ void foo() { } // RUN: rm -rf %t.cache -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: c-index-test -index-file %s -fmodules-cache-path=%t.cache -fmodules -F %t -I %t \ // RUN: -ivfsoverlay %t.yaml -Xclang -fdisable-module-hash | FileCheck %s diff --git a/clang/test/Index/pragma-diag-reparse.c b/clang/test/Index/pragma-diag-reparse.c index 71d0618d70928..aa1413cda089a 100644 --- a/clang/test/Index/pragma-diag-reparse.c +++ b/clang/test/Index/pragma-diag-reparse.c @@ -11,6 +11,7 @@ int main (int argc, const char * argv[]) return x; } +#pragma clang diagnostic ignored "-Wmisleading-indentation" void foo() { int b=0; while (b==b); } // RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_FAILONERROR=1 c-index-test -test-load-source-reparse 5 local \ diff --git a/clang/test/InterfaceStubs/XlinkerInputArgs.cpp b/clang/test/InterfaceStubs/XlinkerInputArgs.cpp new file mode 100644 index 0000000000000..cb4ef8aca952d --- /dev/null +++ b/clang/test/InterfaceStubs/XlinkerInputArgs.cpp @@ -0,0 +1,3 @@ +// RUN: %clang -### -Xlinker -Bsymbolic -emit-interface-stubs 2>&1 | FileCheck %s +// CHECK: Bsymbolic +// CHECK-NOT: Bsymbolic diff --git a/clang/test/InterfaceStubs/constructor-using-shadow.cpp b/clang/test/InterfaceStubs/constructor-using-shadow.cpp new file mode 100644 index 0000000000000..d4b85ac73e56d --- /dev/null +++ b/clang/test/InterfaceStubs/constructor-using-shadow.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s + +// CHECK: --- !experimental-ifs-v1 +// CHECK-NEXT: IfsVersion: 1.0 +// CHECK-NEXT: Triple: +// CHECK-NEXT: ObjectFileFormat: ELF +// CHECK-NEXT: Symbols: +// CHECK-NEXT: ... + + // ConstructorUsingShadowDecl +struct Base { Base(int); }; +struct Derived : public Base { using Base::Base; }; diff --git a/clang/test/InterfaceStubs/cxxdeduction-guide.cpp b/clang/test/InterfaceStubs/cxxdeduction-guide.cpp new file mode 100644 index 0000000000000..f09b9d929ca3e --- /dev/null +++ b/clang/test/InterfaceStubs/cxxdeduction-guide.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -o - -emit-interface-stubs -std=c++17 %s | FileCheck %s + +// CHECK: --- !experimental-ifs-v1 +// CHECK-NEXT: IfsVersion: 1.0 +// CHECK-NEXT: Triple: +// CHECK-NEXT: ObjectFileFormat: ELF +// CHECK-NEXT: Symbols: +// CHECK-NEXT: ... 
+ +// CXXDeductionGuideDecl +template <typename T> struct A { A(); A(T); }; +A() -> A<int>; diff --git a/clang/test/InterfaceStubs/namespace-alias.cpp b/clang/test/InterfaceStubs/namespace-alias.cpp new file mode 100644 index 0000000000000..6a7f27c9b7b0a --- /dev/null +++ b/clang/test/InterfaceStubs/namespace-alias.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s + +// CHECK: --- !experimental-ifs-v1 +// CHECK-NEXT: IfsVersion: 1.0 +// CHECK-NEXT: Triple: +// CHECK-NEXT: ObjectFileFormat: ELF +// CHECK-NEXT: Symbols: +// CHECK-NEXT: ... + +// NamespaceAliasDecl +namespace NS { } +namespace B = NS; diff --git a/clang/test/InterfaceStubs/unresolved-using-typename.cpp b/clang/test/InterfaceStubs/unresolved-using-typename.cpp new file mode 100644 index 0000000000000..e6afc781412a1 --- /dev/null +++ b/clang/test/InterfaceStubs/unresolved-using-typename.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -o - -emit-interface-stubs %s | FileCheck %s + +// CHECK: --- !experimental-ifs-v1 +// CHECK-NEXT: IfsVersion: 1.0 +// CHECK-NEXT: Triple: +// CHECK-NEXT: ObjectFileFormat: ELF +// CHECK-NEXT: Symbols: +// CHECK-NEXT: ... + +// UnresolvedUsingTypenameDecl +template <typename T> class C1 { using ReprType = unsigned; }; +template <typename T> class C2 : public C1<T> { using typename C1<T>::Repr; }; diff --git a/clang/test/Misc/warning-wall.c b/clang/test/Misc/warning-wall.c index fadcceefe297e..2b27b67eafa17 100644 --- a/clang/test/Misc/warning-wall.c +++ b/clang/test/Misc/warning-wall.c @@ -90,6 +90,7 @@ CHECK-NEXT: -Wparentheses-equality CHECK-NEXT: -Wdangling-else CHECK-NEXT: -Wswitch CHECK-NEXT: -Wswitch-bool +CHECK-NEXT: -Wmisleading-indentation CHECK-NOT:-W diff --git a/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/a.h b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/a.h new file mode 100644 index 0000000000000..8adab29eafc76 --- /dev/null +++ b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/a.h @@ -0,0 +1,2 @@ + +constexpr const int& LETemp = 0; diff --git a/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/b.h b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/b.h new file mode 100644 index 0000000000000..2bd1b096d6073 --- /dev/null +++ b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/b.h @@ -0,0 +1,4 @@ + +#include "a.h" + +constexpr const int* PtrTemp1 = &LETemp; diff --git a/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/c.h b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/c.h new file mode 100644 index 0000000000000..b023eebca49c2 --- /dev/null +++ b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/c.h @@ -0,0 +1,4 @@ + +#include "a.h" + +constexpr const int* PtrTemp2 = &LETemp; diff --git a/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/module.modulemap b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/module.modulemap new file mode 100644 index 0000000000000..1339d627a44af --- /dev/null +++ b/clang/test/Modules/Inputs/merge-lifetime-extended-temporary/module.modulemap @@ -0,0 +1,14 @@ +module "a" { + export * + header "a.h" +} + +module "b" { + export * + header "b.h" +} + +module "c" { + export * + header "c.h" +} diff --git a/clang/test/Modules/crash-vfs-ivfsoverlay.m b/clang/test/Modules/crash-vfs-ivfsoverlay.m index 00992aa19fad6..d2d2ccbd2546b 100644 --- a/clang/test/Modules/crash-vfs-ivfsoverlay.m +++ b/clang/test/Modules/crash-vfs-ivfsoverlay.m @@ -3,7 +3,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t/m // RUN: cp %S/../VFS/Inputs/actual_module2.map %t/actual_module2.map
-// RUN: sed -e "s@INPUT_DIR@%/t@g" -e "s@OUT_DIR@%/t/example@g" \ +// RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}@g" -e "s@OUT_DIR@%{/t:regex_replacement}/example@g" \ // RUN: %S/../VFS/Inputs/vfsoverlay2.yaml > %t/srcvfs.yaml // RUN: env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \ diff --git a/clang/test/Modules/double-quotes.m b/clang/test/Modules/double-quotes.m index 4ce712ccc6c54..99187fc26654e 100644 --- a/clang/test/Modules/double-quotes.m +++ b/clang/test/Modules/double-quotes.m @@ -4,7 +4,7 @@ // RUN: %hmaptool write %S/Inputs/double-quotes/a.hmap.json %t/a.hmap // RUN: %hmaptool write %S/Inputs/double-quotes/x.hmap.json %t/x.hmap -// RUN: sed -e "s@TEST_DIR@%/S/Inputs/double-quotes@g" \ +// RUN: sed -e "s@TEST_DIR@%{/S:regex_replacement}/Inputs/double-quotes@g" \ // RUN: %S/Inputs/double-quotes/z.yaml > %t/z.yaml // The output with and without modules should be the same diff --git a/clang/test/Modules/framework-public-includes-private.m b/clang/test/Modules/framework-public-includes-private.m index 0f1e3a242a158..37c43e9a6390b 100644 --- a/clang/test/Modules/framework-public-includes-private.m +++ b/clang/test/Modules/framework-public-includes-private.m @@ -4,7 +4,7 @@ // RUN: %hmaptool write %S/Inputs/framework-public-includes-private/a.hmap.json %t/a.hmap // RUN: %hmaptool write %S/Inputs/framework-public-includes-private/z.hmap.json %t/z.hmap -// RUN: sed -e "s@TEST_DIR@%/S/Inputs/framework-public-includes-private@g" \ +// RUN: sed -e "s@TEST_DIR@%{/S:regex_replacement}/Inputs/framework-public-includes-private@g" \ // RUN: %S/Inputs/framework-public-includes-private/z.yaml > %t/z.yaml // The output with and without modules should be the same, without modules first. diff --git a/clang/test/Modules/merge-lifetime-extended-temporary.cpp b/clang/test/Modules/merge-lifetime-extended-temporary.cpp new file mode 100644 index 0000000000000..36db948b2c4ef --- /dev/null +++ b/clang/test/Modules/merge-lifetime-extended-temporary.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -x c++ -I%S/Inputs/merge-lifetime-extended-temporary -verify -std=c++11 %s -DORDER=1 +// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -x c++ -I%S/Inputs/merge-lifetime-extended-temporary -verify -std=c++11 %s -DORDER=2 + +// expected-no-diagnostics +#if ORDER == 1 +#include "c.h" +#include "b.h" +#else +#include "b.h" +#include "c.h" +#endif + +static_assert(PtrTemp1 == &LETemp, ""); +static_assert(PtrTemp1 == PtrTemp2, ""); diff --git a/clang/test/OpenMP/declare_reduction_codegen.cpp b/clang/test/OpenMP/declare_reduction_codegen.cpp index 95c607d83246c..1f6fa2bebea36 100644 --- a/clang/test/OpenMP/declare_reduction_codegen.cpp +++ b/clang/test/OpenMP/declare_reduction_codegen.cpp @@ -85,9 +85,8 @@ SSS<int> d; // CHECK-NEXT: ret void // CHECK-NEXT: } -// CHECK: define {{.*}}void [[INIT:@[^(]+]]([[SSS_INT]]* -// CHECK-LOAD: define {{.*}}void [[INIT:@[^(]+]]([[SSS_INT]]* -void init(SSS<int> &lhs, SSS<int> &rhs) {} +template <typename T> +void init(T &lhs, T &rhs) {} #pragma omp declare reduction(fun : SSS < int > : omp_out = omp_in) initializer(init(omp_priv, omp_orig)) // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias %0, [[SSS_INT]]* noalias %1) @@ -95,7 +94,7 @@ void init(SSS<int> &lhs, SSS<int> &rhs) {} // CHECK-NEXT: ret void // CHECK-NEXT: } // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias %0, [[SSS_INT]]* noalias %1) -// CHECK: call {{.*}}void [[INIT]]( +// CHECK: call {{.*}}void @_Z4initI3SSSIiEEvRT_S3_( //
 // CHECK-NEXT: }
@@ -104,10 +103,13 @@ void init(SSS<int> &lhs, SSS<int> &rhs) {}
 // CHECK-LOAD-NEXT: ret void
 // CHECK-LOAD-NEXT: }
 // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias %0, [[SSS_INT]]* noalias %1)
-// CHECK-LOAD: call {{.*}}void [[INIT]](
+// CHECK-LOAD: call {{.*}}void @_Z4initI3SSSIiEEvRT_S3_(
 // CHECK-LOAD-NEXT: ret void
 // CHECK-LOAD-NEXT: }
 
+// CHECK: define {{.*}}void @_Z4initI3SSSIiEEvRT_S3_(%struct.SSS* {{.+}}, %struct.SSS* {{.+}})
+// CHECK-LOAD: define {{.*}}void @_Z4initI3SSSIiEEvRT_S3_(%struct.SSS* {{.+}}, %struct.SSS* {{.+}})
+
 template <typename T>
 T foo(T a) {
 #pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = 15 * omp_orig)
diff --git a/clang/test/OpenMP/declare_reduction_codegen_in_templates.cpp b/clang/test/OpenMP/declare_reduction_codegen_in_templates.cpp
new file mode 100644
index 0000000000000..0409c02191445
--- /dev/null
+++ b/clang/test/OpenMP/declare_reduction_codegen_in_templates.cpp
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++17 -emit-llvm %s -triple x86_64-linux -fexceptions -fcxx-exceptions -o - -femit-all-decls -disable-llvm-passes | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++17 -triple x86_64-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -femit-all-decls -disable-llvm-passes
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-linux -fexceptions -fcxx-exceptions -std=c++17 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls -disable-llvm-passes | FileCheck %s
+
+// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++17 -emit-llvm %s -triple x86_64-linux -fexceptions -fcxx-exceptions -o - -femit-all-decls -disable-llvm-passes | FileCheck --check-prefix SIMD-ONLY0 %s
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++17 -triple x86_64-linux -fexceptions -fcxx-exceptions -emit-pch -o %t %s -femit-all-decls -disable-llvm-passes
+// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-linux -fexceptions -fcxx-exceptions -std=c++17 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls -disable-llvm-passes | FileCheck --check-prefix SIMD-ONLY0 %s
+// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
+// expected-no-diagnostics
+
+// CHECK: call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @{{.+}}, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[STD_D:%.+]]*)* [[OUTLINED:@.+]] to void (i32*, i32*, ...)*), [[STD_D]]* %{{.+}})
+
+// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, [[STD_D]]* {{.+}})
+// CHECK: call i32 @__kmpc_reduce_nowait(%struct.ident_t*
+
+#ifndef HEADER
+#define HEADER
+
+typedef long unsigned a;
+namespace std {
+template <class> class initializer_list {
+  const int *b;
+  a c;
+};
+template <typename> class d {};
+template <typename e> class f {
+public:
+  f(initializer_list<e>);
+};
+} // namespace std
+template <class g, class h> void foo(g, h) {
+  std::d<g> i;
+#pragma omp declare reduction(j : std::d<g> : []{}())
+#pragma omp parallel reduction(j : i)
+  ;
+}
+void k() {
+  std::f<int> l{};
+  std::f<std::f<int>> m{2};
+  foo(l, m);
+}
+
+#endif // HEADER
diff --git a/clang/test/OpenMP/parallel_codegen.cpp b/clang/test/OpenMP/parallel_codegen.cpp
index bacb2c6b06eef..498a0590b51de 100644
--- a/clang/test/OpenMP/parallel_codegen.cpp
+++ b/clang/test/OpenMP/parallel_codegen.cpp
@@ -109,7 +109,7 @@ int main (int argc, char **argv) {
 // CHECK-DEBUG-NEXT: ret i32 0
 // CHECK-DEBUG-NEXT: }
 
-// CHECK: define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i8*** dereferenceable({{4|8}}) %argc, i{{64|32}} %{{.+}})
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i8*** dereferenceable({{4|8}}) %argc, i{{64|32}}{{.*}} %{{.+}})
 // CHECK: store i8*** %argc, i8**** [[ARGC_PTR_ADDR:%.+]],
 // CHECK: [[ARGC_REF:%.+]] = load i8***, i8**** [[ARGC_PTR_ADDR]]
 // CHECK: [[ARGC:%.+]] = load i8**, i8*** [[ARGC_REF]]
diff --git a/clang/test/OpenMP/parallel_for_simd_codegen.cpp b/clang/test/OpenMP/parallel_for_simd_codegen.cpp
index 9585bf293695c..01f2b4c42a243 100644
--- a/clang/test/OpenMP/parallel_for_simd_codegen.cpp
+++ b/clang/test/OpenMP/parallel_for_simd_codegen.cpp
@@ -1,14 +1,24 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=OMP45 --check-prefix=CHECK
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s --check-prefix=OMP50 --check-prefix=CHECK
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+
// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s // RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -debug-info-kind=limited -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // expected-no-diagnostics +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} #ifndef HEADER #define HEADER @@ -75,7 +85,7 @@ void simple(float *a, float *b, float *c, float *d) { // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_VAR:%[^,]+]] // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]] -// CHECK: call void @__kmpc_dispatch_init_4(%struct.ident_t* {{.+}}, i32 %{{.+}}, i32 35, i32 0, i32 8, i32 1, i32 1) +// CHECK: call void @__kmpc_dispatch_init_4(%struct.ident_t* {{.+}}, i32 %{{.+}}, i32 {{35|1073741859}}, i32 0, i32 8, i32 1, i32 1) // CHECK: [[NEXT:%.+]] = call i32 @__kmpc_dispatch_next_4(%struct.ident_t* {{.+}}, i32 %{{.+}}, i32* %{{.+}}, i32* [[LB:%.+]], i32* [[UB:%.+]], i32* %{{.+}}) // CHECK: [[COND:%.+]] = icmp ne i32 [[NEXT]], 0 // CHECK: br i1 [[COND]], label %[[CONT:.+]], label %[[END:.+]] @@ -386,6 +396,51 @@ void inst_templ1() { templ1 (a, z); } +// OMP50: call void @__kmpc_for_static_init_8(%struct.ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 34, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 1) +// OMP50: [[UB_VAL:%.+]] = load i64, i64* [[UB]], +// OMP50: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 15 +// OMP50: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]] +// OMP50: [[TRUE]]: +// OMP50: br label %[[SWITCH:[^,]+]] +// OMP50: [[FALSE]]: +// OMP50: [[UB_VAL:%.+]] = load i64, i64* [[UB]], +// OMP50: br label %[[SWITCH]] +// OMP50: [[SWITCH]]: +// OMP50: [[UP:%.+]] = phi i64 [ 15, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ] +// OMP50: store i64 [[UP]], i64* [[UB]], +// OMP50: [[LB_VAL:%.+]] = load i64, i64* [[LB]], +// OMP50: store i64 [[LB_VAL]], i64* [[T1_OMP_IV:%[^,]+]], + +// ... 
+// OMP50: [[IV:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP50-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]] +// OMP50-NEXT: [[CMP1:%.+]] = icmp sle i64 [[IV]], [[UB_VAL]] +// OMP50-NEXT: br i1 [[CMP1]], label %[[T1_BODY:.+]], label %[[T1_END:[^,]+]] +// OMP50: [[T1_BODY]]: +// Loop counters i and j updates: +// OMP50: [[IV1:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP50-NEXT: [[I_1:%.+]] = sdiv i64 [[IV1]], 4 +// OMP50-NEXT: [[I_1_MUL1:%.+]] = mul nsw i64 [[I_1]], 1 +// OMP50-NEXT: [[I_1_ADD0:%.+]] = add nsw i64 0, [[I_1_MUL1]] +// OMP50-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32 +// OMP50-NEXT: store i32 [[I_2]], i32* +// OMP50: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP50: [[IV2_1:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP50-NEXT: [[DIV_1:%.+]] = sdiv i64 [[IV2_1]], 4 +// OMP50-NEXT: [[MUL_1:%.+]] = mul nsw i64 [[DIV_1]], 4 +// OMP50-NEXT: [[J_1:%.+]] = sub nsw i64 [[IV2]], [[MUL_1]] +// OMP50-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2 +// OMP50-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]] +// OMP50-NEXT: store i64 [[J_2_ADD0]], i64* +// simd.for.inc: +// OMP50: [[IV3:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP50-NEXT: [[INC:%.+]] = add nsw i64 [[IV3]], 1 +// OMP50-NEXT: store i64 [[INC]], i64* +// OMP50-NEXT: br label {{%.+}} +// OMP50: [[T1_END]]: +// OMP50: call void @__kmpc_for_static_fini(%struct.ident_t* {{.+}}, i32 %{{.+}}) +// OMP50: ret void +// typedef int MyIdx; @@ -674,51 +729,77 @@ void widened(float *a, float *b, float *c, float *d) { // CHECK: ret void } -// CHECK: call void @__kmpc_for_static_init_8(%struct.ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 34, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 1) -// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]], -// CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 15 -// CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]] -// CHECK: [[TRUE]]: -// CHECK: br label %[[SWITCH:[^,]+]] -// CHECK: [[FALSE]]: -// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]], -// CHECK: br label %[[SWITCH]] -// CHECK: [[SWITCH]]: -// CHECK: [[UP:%.+]] = phi i64 [ 15, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ] -// CHECK: store i64 [[UP]], i64* [[UB]], -// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]], -// CHECK: store i64 [[LB_VAL]], i64* [[T1_OMP_IV:%[^,]+]], +// CHECK-LABEL: if_clause +void if_clause(int a) { + #pragma omp parallel for simd if(a) schedule(static, 1) +for (int i = 0; i < 10; ++i); +} +// CHECK: call void @__kmpc_for_static_init_4( +// OMP50: [[COND:%.+]] = trunc i8 %{{.+}} to i1 +// OMP50: br i1 [[COND]], label {{%?}}[[THEN:.+]], label {{%?}}[[ELSE:.+]] + +// OMP50: [[THEN]]: +// OMP45: br label {{.+}}, !llvm.loop ![[VECT:.+]] +// OMP50: br label {{.+}}, !llvm.loop ![[VECT:.+]] +// OMP50: [[ELSE]]: +// OMP50: br label {{.+}}, !llvm.loop ![[NOVECT:.+]] +// CHECK: call void @__kmpc_for_static_fini( + +// OMP45: call void @__kmpc_for_static_init_8(%struct.ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 34, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 1) +// OMP45: [[UB_VAL:%.+]] = load i64, i64* [[UB]], +// OMP45: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 15 +// OMP45: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]] +// OMP45: [[TRUE]]: +// OMP45: br label %[[SWITCH:[^,]+]] +// OMP45: [[FALSE]]: +// OMP45: [[UB_VAL:%.+]] = load i64, i64* [[UB]], +// OMP45: br label %[[SWITCH]] +// OMP45: [[SWITCH]]: +// OMP45: [[UP:%.+]] = phi i64 [ 15, %[[TRUE]] ], [ [[UB_VAL]], %[[FALSE]] ] +// OMP45: store i64 [[UP]], i64* [[UB]], +// OMP45: 
[[LB_VAL:%.+]] = load i64, i64* [[LB]], +// OMP45: store i64 [[LB_VAL]], i64* [[T1_OMP_IV:%[^,]+]], // ... -// CHECK: [[IV:%.+]] = load i64, i64* [[T1_OMP_IV]] -// CHECK-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]] -// CHECK-NEXT: [[CMP1:%.+]] = icmp sle i64 [[IV]], [[UB_VAL]] -// CHECK-NEXT: br i1 [[CMP1]], label %[[T1_BODY:.+]], label %[[T1_END:[^,]+]] -// CHECK: [[T1_BODY]]: +// OMP45: [[IV:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP45-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]] +// OMP45-NEXT: [[CMP1:%.+]] = icmp sle i64 [[IV]], [[UB_VAL]] +// OMP45-NEXT: br i1 [[CMP1]], label %[[T1_BODY:.+]], label %[[T1_END:[^,]+]] +// OMP45: [[T1_BODY]]: // Loop counters i and j updates: -// CHECK: [[IV1:%.+]] = load i64, i64* [[T1_OMP_IV]] -// CHECK-NEXT: [[I_1:%.+]] = sdiv i64 [[IV1]], 4 -// CHECK-NEXT: [[I_1_MUL1:%.+]] = mul nsw i64 [[I_1]], 1 -// CHECK-NEXT: [[I_1_ADD0:%.+]] = add nsw i64 0, [[I_1_MUL1]] -// CHECK-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32 -// CHECK-NEXT: store i32 [[I_2]], i32* -// CHECK: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]] -// CHECK: [[IV2_1:%.+]] = load i64, i64* [[T1_OMP_IV]] -// CHECK-NEXT: [[DIV_1:%.+]] = sdiv i64 [[IV2_1]], 4 -// CHECK-NEXT: [[MUL_1:%.+]] = mul nsw i64 [[DIV_1]], 4 -// CHECK-NEXT: [[J_1:%.+]] = sub nsw i64 [[IV2]], [[MUL_1]] -// CHECK-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2 -// CHECK-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]] -// CHECK-NEXT: store i64 [[J_2_ADD0]], i64* +// OMP45: [[IV1:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP45-NEXT: [[I_1:%.+]] = sdiv i64 [[IV1]], 4 +// OMP45-NEXT: [[I_1_MUL1:%.+]] = mul nsw i64 [[I_1]], 1 +// OMP45-NEXT: [[I_1_ADD0:%.+]] = add nsw i64 0, [[I_1_MUL1]] +// OMP45-NEXT: [[I_2:%.+]] = trunc i64 [[I_1_ADD0]] to i32 +// OMP45-NEXT: store i32 [[I_2]], i32* +// OMP45: [[IV2:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP45: [[IV2_1:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP45-NEXT: [[DIV_1:%.+]] = sdiv i64 [[IV2_1]], 4 +// OMP45-NEXT: [[MUL_1:%.+]] = mul nsw i64 [[DIV_1]], 4 +// OMP45-NEXT: [[J_1:%.+]] = sub nsw i64 [[IV2]], [[MUL_1]] +// OMP45-NEXT: [[J_2:%.+]] = mul nsw i64 [[J_1]], 2 +// OMP45-NEXT: [[J_2_ADD0:%.+]] = add nsw i64 0, [[J_2]] +// OMP45-NEXT: store i64 [[J_2_ADD0]], i64* // simd.for.inc: -// CHECK: [[IV3:%.+]] = load i64, i64* [[T1_OMP_IV]] -// CHECK-NEXT: [[INC:%.+]] = add nsw i64 [[IV3]], 1 -// CHECK-NEXT: store i64 [[INC]], i64* -// CHECK-NEXT: br label {{%.+}} -// CHECK: [[T1_END]]: -// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* {{.+}}, i32 %{{.+}}) -// CHECK: ret void +// OMP45: [[IV3:%.+]] = load i64, i64* [[T1_OMP_IV]] +// OMP45-NEXT: [[INC:%.+]] = add nsw i64 [[IV3]], 1 +// OMP45-NEXT: store i64 [[INC]], i64* +// OMP45-NEXT: br label {{%.+}} +// OMP45: [[T1_END]]: +// OMP45: call void @__kmpc_for_static_fini(%struct.ident_t* {{.+}}, i32 %{{.+}}) +// OMP45: ret void // + +// OMP45-NOT: !{!"llvm.loop.vectorize.enable", i1 false} +// OMP45-DAG: ![[VECT]] = distinct !{![[VECT]], ![[VM:.+]]} +// OMP45-DAG: ![[VM]] = !{!"llvm.loop.vectorize.enable", i1 true} +// OMP45-NOT: !{!"llvm.loop.vectorize.enable", i1 false} +// OMP50-DAG: ![[VECT]] = distinct !{![[VECT]], ![[VM:.+]]} +// OMP50-DAG: ![[VM]] = !{!"llvm.loop.vectorize.enable", i1 true} +// OMP50-DAG: ![[NOVECT]] = distinct !{![[NOVECT]], ![[NOVM:.+]]} +// OMP50-DAG: ![[NOVM]] = !{!"llvm.loop.vectorize.enable", i1 false} + // TERM_DEBUG-LABEL: bar int bar() {return 0;}; diff --git a/clang/test/OpenMP/taskloop_simd_ast_print.cpp b/clang/test/OpenMP/taskloop_simd_ast_print.cpp index d5403ed06d97f..59144f344949b 
100644 --- a/clang/test/OpenMP/taskloop_simd_ast_print.cpp +++ b/clang/test/OpenMP/taskloop_simd_ast_print.cpp @@ -1,10 +1,16 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s --check-prefix CHECK --check-prefix OMP45 // RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s -DOMP5 +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print -DOMP5 | FileCheck %s --check-prefix CHECK --check-prefix OMP50 // expected-no-diagnostics #ifndef HEADER @@ -69,12 +75,17 @@ int main(int argc, char **argv) { // CHECK-NEXT: for (int i = 0; i < 2; ++i) // CHECK-NEXT: a = 2; #pragma omp parallel +#ifdef OMP5 +#pragma omp taskloop simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) shared(g) if(simd: argc) mergeable priority(argc) simdlen(16) grainsize(argc) reduction(max: a, e) +#else #pragma omp taskloop simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) shared(g) if(argc) mergeable priority(argc) simdlen(16) grainsize(argc) reduction(max: a, e) +#endif // OMP5 for (int i = 0; i < 10; ++i) for (int j = 0; j < 10; ++j) foo(); // CHECK-NEXT: #pragma omp parallel - // CHECK-NEXT: #pragma omp taskloop simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) shared(g) if(argc) mergeable priority(argc) simdlen(16) grainsize(argc) reduction(max: a,e) + // OMP50-NEXT: #pragma omp taskloop simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) shared(g) if(simd: argc) mergeable priority(argc) simdlen(16) grainsize(argc) reduction(max: a,e) + // OMP45-NEXT: #pragma omp taskloop simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) shared(g) if(argc) mergeable priority(argc) simdlen(16) grainsize(argc) reduction(max: a,e) // CHECK-NEXT: for (int i = 0; i < 10; ++i) // CHECK-NEXT: for (int j = 
0; j < 10; ++j) // CHECK-NEXT: foo(); diff --git a/clang/test/OpenMP/taskloop_simd_codegen.cpp b/clang/test/OpenMP/taskloop_simd_codegen.cpp index 4c84eccb4f856..6b8f3543dfb54 100644 --- a/clang/test/OpenMP/taskloop_simd_codegen.cpp +++ b/clang/test/OpenMP/taskloop_simd_codegen.cpp @@ -1,10 +1,16 @@ -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=45 -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s --check-prefix CHECK --check-prefix OMP45 +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fopenmp-version=50 -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix OMP50 -// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=45 -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp-simd -fopenmp-version=50 -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER @@ -45,10 +51,13 @@ int main(int argc, char **argv) { for (int i = 0; i < 10; ++i) ; // CHECK: call void @__kmpc_taskgroup(%struct.ident_t* [[DEFLOC]], i32 [[GTID]]) -// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*)) +// OMP45: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 
80, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*)) +// OMP50: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%struct.ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 80, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*)) // CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]* // CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0 -// CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0 +// OMP45: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0 +// OMP50: [[IF_VAL:%.+]] = load i8, i8* % +// OMP50: [[IF:%.+]] = trunc i8 [[IF_VAL]] to i1 // CHECK: [[IF_INT:%.+]] = sext i1 [[IF]] to i32 // CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5 // CHECK: store i64 0, i64* [[DOWN]], diff --git a/clang/test/Parser/cxx-ambig-decl-expr.cpp b/clang/test/Parser/cxx-ambig-decl-expr.cpp index 6507eafb74cd7..02857e21f7c3e 100644 --- a/clang/test/Parser/cxx-ambig-decl-expr.cpp +++ b/clang/test/Parser/cxx-ambig-decl-expr.cpp @@ -17,3 +17,25 @@ auto (*q)() -> int(*)(unknown); // expected-error {{unknown type name 'unknown'} auto (*r)() -> int(*)(unknown + 1); // expected-error {{undeclared identifier 'unknown'}} int f(unknown const x); // expected-error {{unknown type name 'unknown'}} + +// Disambiguating an array declarator from an array subscripting. +void arr() { + int x[] = {1}; // expected-note 2{{previous}} + + // This is array indexing not an array declarator because a comma expression + // is not syntactically a constant-expression. + int(x[1,1]); // expected-warning 2{{unused}} + + // This is array indexing not an array declaration because a braced-init-list + // is not syntactically a constant-expression. + int(x[{0}]); // expected-error {{array subscript is not an integer}} + struct A { + struct Q { int n; }; + int operator[](Q); + } a; + int(a[{0}]); // expected-warning {{unused}} + + // These are array declarations. + int(x[(1,1)]); // expected-error {{redefinition}} + int(x[true ? 
1,1 : 1]); // expected-error {{redefinition}} +} diff --git a/clang/test/Parser/warn-misleading-indentation.cpp b/clang/test/Parser/warn-misleading-indentation.cpp new file mode 100644 index 0000000000000..e5ed8bba93c15 --- /dev/null +++ b/clang/test/Parser/warn-misleading-indentation.cpp @@ -0,0 +1,208 @@ +// RUN: %clang_cc1 -x c -fsyntax-only -verify %s +// RUN: %clang_cc1 -x c -fsyntax-only -verify -Wmisleading-indentation -DWITH_WARN %s +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify -Wall -Wno-unused -DWITH_WARN -DCXX17 %s +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify -Wall -Wno-unused -Wno-misleading-indentation -DCXX17 %s + +#ifndef WITH_WARN +// expected-no-diagnostics +#endif + +void f0(int i) { + if (i) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i = i + 1; + int x = 0; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'if'}} +#endif + return; +#ifdef CXX17 + if constexpr (false) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i = 0; + i += 1; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'if'}} +#endif +#endif +} + +void f1(int i) { + for (;i;) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i = i + 1; + i *= 2; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'for'}} +#endif + return; +} + +void f2(int i) { + while (i) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i = i + 1; i *= 2; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'while'}} +#endif + return; +} + +void f3(int i) { + if (i) + i = i + 1; + else +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i *= 2; + const int x = 0; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'else'}} +#endif +} + +#ifdef CXX17 +struct Range { + int *begin() {return nullptr;} + int *end() {return nullptr;} +}; +#endif + +void f4(int i) { + if (i) + i *= 2; + return; + if (i) + i *= 2; + ; + if (i) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i *= 2; + typedef int Int; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'if'}} +#endif +#ifdef CXX17 + Range R; + for (auto e : R) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i *= 2; + using Int2 = int; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'for'}} +#endif +#endif +} + +int bar(void); + +int foo(int* dst) +{ + if (dst) + return + bar(); + if (dst) + dst = dst + \ + bar(); + return 0; +} + +void g(int i) { + if (1) + i = 2; + else + if (i == 3) +#ifdef WITH_WARN +// expected-note@-3 {{here}} +#endif + i = 4; + i = 5; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'if'}} +#endif +} + +// Or this +#define TEST i = 5 +void g0(int i) { + if (1) + i = 2; + else + i = 5; + TEST; +} + +void g1(int i) { + if (1) + i = 2; + else if (i == 3) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i = 4; + i = 5; +#ifdef WITH_WARN +// expected-warning@-2 {{misleading indentation; statement is not part of the previous 'if'}} +#endif +} + +void g2(int i) { + if (1) + i = 2; + else + if (i == 3) + {i = 4;} + i = 5; +} + +void g6(int i) { + if (1) + if (i == 3) +#ifdef WITH_WARN +// expected-note@-2 {{here}} +#endif + i = 4; + i = 5; +#ifdef WITH_WARN +// 
expected-warning@-2 {{misleading indentation; statement is not part of the previous 'if'}} +#endif +} + +void g7(int i) { + if (1) + i = 4; +#ifdef TEST1 +#endif + i = 5; +} + +void a1(int i) { if (1) i = 4; return; } + +void a2(int i) { + { + if (1) + i = 4; + } + return; +} + +void a3(int i) { + if (1) + { + i = 4; + } + return; +} \ No newline at end of file diff --git a/clang/test/Preprocessor/file_test.c b/clang/test/Preprocessor/file_test.c new file mode 100644 index 0000000000000..3788db6eb090e --- /dev/null +++ b/clang/test/Preprocessor/file_test.c @@ -0,0 +1,23 @@ +// XFAIL: system-windows +// RUN: %clang -E -ffile-prefix-map=%p=/UNLIKELY_PATH/empty -c -o - %s | FileCheck %s +// RUN: %clang -E -fmacro-prefix-map=%p=/UNLIKELY_PATH/empty -c -o - %s | FileCheck %s +// RUN: %clang -E -fmacro-prefix-map=%p=/UNLIKELY_PATH=empty -c -o - %s | FileCheck %s -check-prefix CHECK-EVIL +// RUN: %clang -E -fmacro-prefix-map=%p/= -c -o - %s | FileCheck %s --check-prefix CHECK-REMOVE + +filename: __FILE__ +#include "file_test.h" + +// CHECK: filename: "/UNLIKELY_PATH/empty{{/|\\\\}}file_test.c" +// CHECK: filename: "/UNLIKELY_PATH/empty{{/|\\\\}}file_test.h" +// CHECK: basefile: "/UNLIKELY_PATH/empty{{/|\\\\}}file_test.c" +// CHECK-NOT: filename: + +// CHECK-EVIL: filename: "/UNLIKELY_PATH=empty{{/|\\\\}}file_test.c" +// CHECK-EVIL: filename: "/UNLIKELY_PATH=empty{{/|\\\\}}file_test.h" +// CHECK-EVIL: basefile: "/UNLIKELY_PATH=empty{{/|\\\\}}file_test.c" +// CHECK-EVIL-NOT: filename: + +// CHECK-REMOVE: filename: "file_test.c" +// CHECK-REMOVE: filename: "file_test.h" +// CHECK-REMOVE: basefile: "file_test.c" +// CHECK-REMOVE-NOT: filename: diff --git a/clang/test/Preprocessor/file_test.h b/clang/test/Preprocessor/file_test.h new file mode 100644 index 0000000000000..c289e5c836280 --- /dev/null +++ b/clang/test/Preprocessor/file_test.h @@ -0,0 +1,2 @@ +filename: __FILE__ +basefile: __BASE_FILE__ diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c index 6034c085024dd..928ca6f4fa8d9 100644 --- a/clang/test/Preprocessor/predefined-win-macros.c +++ b/clang/test/Preprocessor/predefined-win-macros.c @@ -47,7 +47,7 @@ // RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \ // RUN: -fms-compatibility-version=19.00 -std=c++2a -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2A // CHECK-MS-CPP2A: #define _MSC_VER 1900 -// CHECK-MS-CPP2A: #define _MSVC_LANG 201704L +// CHECK-MS-CPP2A: #define _MSVC_LANG 201705L // RUN: %clang_cc1 -triple i386-windows %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-WIN diff --git a/clang/test/Sema/arm-global-regs.c b/clang/test/Sema/arm-global-regs.c deleted file mode 100644 index 753cb60e68388..0000000000000 --- a/clang/test/Sema/arm-global-regs.c +++ /dev/null @@ -1,20 +0,0 @@ -// RUN: %clang_cc1 -ffreestanding -fsyntax-only -target-feature +reserve-r9 -verify -triple arm-arm-none-eabi %s - -// Check a small subset of valid and invalid global register variable declarations. -// Also check that for global register variables without -ffixed-reg options it throws an error. 
-
-register unsigned arm_r3 __asm("r3"); //expected-error {{register 'r3' unsuitable for global register variables on this target}}
-
-register unsigned arm_r12 __asm("r12"); //expected-error {{register 'r12' unsuitable for global register variables on this target}}
-
-register unsigned arm_r5 __asm("r5"); //expected-error {{register 'r5' unsuitable for global register variables on this target}}
-
-register unsigned arm_r9 __asm("r9");
-
-register unsigned arm_r6 __asm("r6"); //expected-error {{-ffixed-r6 is required for global named register variable declaration}}
-
-register unsigned arm_r7 __asm("r7"); //expected-error {{-ffixed-r7 is required for global named register variable declaration}}
-
-register unsigned *parm_r7 __asm("r7"); //expected-error {{-ffixed-r7 is required for global named register variable declaration}}
-
-register unsigned arm_sp __asm("sp");
diff --git a/clang/test/Sema/builtins-mips-features.c b/clang/test/Sema/builtins-mips-features.c
new file mode 100644
index 0000000000000..4ea36d7f24dc0
--- /dev/null
+++ b/clang/test/Sema/builtins-mips-features.c
@@ -0,0 +1,37 @@
+// REQUIRES: mips-registered-target
+// RUN: %clang_cc1 -triple mips64 -fsyntax-only -verify %s
+
+typedef signed char v4i8 __attribute__ ((vector_size(4)));
+typedef signed char v4q7 __attribute__ ((vector_size(4)));
+typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
+typedef unsigned char v16u8 __attribute__((vector_size(16), aligned(16)));
+
+void dsp() {
+  v4i8 a;
+  void* p;
+
+  // expected-error@+1 {{this builtin requires 'dsp' ASE, please use -mdsp}}
+  __builtin_mips_addu_qb(a, a);
+  // expected-error@+1 {{this builtin requires 'dsp' ASE, please use -mdsp}}
+  __builtin_mips_lwx(p, 32);
+}
+
+void dspr2() {
+  v4i8 a;
+  v4q7 b;
+
+  // expected-error@+1 {{this builtin requires 'dsp r2' ASE, please use -mdspr2}}
+  __builtin_mips_absq_s_qb(b);
+  // expected-error@+1 {{this builtin requires 'dsp r2' ASE, please use -mdspr2}}
+  __builtin_mips_subuh_r_qb(a, a);
+}
+
+void msa() {
+  v16i8 a;
+  v16u8 b;
+
+  // expected-error@+1 {{this builtin requires 'msa' ASE, please use -mmsa}}
+  __builtin_msa_add_a_b(a, a);
+  // expected-error@+1 {{this builtin requires 'msa' ASE, please use -mmsa}}
+  __builtin_msa_xori_b(b, 5);
+}
diff --git a/clang/test/Sema/eval-info.c b/clang/test/Sema/eval-info.c
new file mode 100644
index 0000000000000..7f4de4b908207
--- /dev/null
+++ b/clang/test/Sema/eval-info.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -fsyntax-only -triple x86_64-unknown-windows-msvc -verify
+
+// expected-no-diagnostics
+
+// Make sure the new constant interpreter is not enabled unintentionally,
+// which would cause an assertion.
+typedef enum x {
+  a = 1,
+} x;
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
index 8ba7686944468..3319d5aa2db8c 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
@@ -450,3 +450,8 @@ MyIntPointer handleDerivedToBaseCast1(MySpecialIntPointer ptr) {
 MyIntPointer handleDerivedToBaseCast2(MyOwnerIntPointer ptr) {
   return ptr; // expected-warning {{address of stack memory associated with parameter 'ptr' returned}}
 }
+
+std::vector<int *>::iterator noFalsePositiveWithVectorOfPointers() {
+  std::vector<std::vector<int *>::iterator> iters;
+  return iters.at(0);
+}
diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp
index 8db705dcdc67f..c2e443b9bec10 100644
--- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp
@@ -18,6 +18,7 @@ namespace std {
 [[nodiscard]] void *operator new(std::size_t, std::align_val_t, const std::nothrow_t&) noexcept;
 [[nodiscard]] void *operator new[](std::size_t, const std::nothrow_t&) noexcept;
 [[nodiscard]] void *operator new[](std::size_t, std::align_val_t, const std::nothrow_t&) noexcept;
+[[nodiscard]] void *operator new[](std::size_t, std::align_val_t);
 void operator delete(void*, const std::nothrow_t&) noexcept;
 void operator delete(void*, std::align_val_t, const std::nothrow_t&) noexcept;
 void operator delete[](void*, const std::nothrow_t&) noexcept;
@@ -1050,7 +1051,7 @@ namespace dynamic_alloc {
     // Ensure that we don't try to evaluate these for overflow and crash. These
     // are all value-dependent expressions.
     p = new char[n];
-    p = new (n) char[n];
+    p = new ((std::align_val_t)n) char[n];
     p = new char(n);
   }
 }
diff --git a/clang/test/SemaCXX/deprecated-copy.cpp b/clang/test/SemaCXX/deprecated-copy.cpp
new file mode 100644
index 0000000000000..4d3e798d912ba
--- /dev/null
+++ b/clang/test/SemaCXX/deprecated-copy.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -std=c++11 %s -Wdeprecated-copy -verify
+// RUN: %clang_cc1 -std=c++11 %s -Wdeprecated-copy-dtor -DDEPRECATED_COPY_DTOR -verify
+// RUN: %clang_cc1 -std=c++11 %s -Wextra -verify
+
+#ifdef DEPRECATED_COPY_DTOR
+struct A {
+  int *ptr;
+  ~A() { delete ptr; } // expected-warning {{definition of implicit copy constructor for 'A' is deprecated because it has a user-declared destructor}}
+};
+
+void foo() {
+  A a{};
+  A b = a; // expected-note {{implicit copy constructor for 'A' first required here}}
+}
+#else
+struct B {
+  B &operator=(const B &); // expected-warning {{definition of implicit copy constructor for 'B' is deprecated because it has a user-declared copy assignment operator}}
+};
+
+void bar() {
+  B b1, b2(b1); // expected-note {{implicit copy constructor for 'B' first required here}}
+}
+#endif
diff --git a/clang/test/SemaCXX/lambda-expressions.cpp b/clang/test/SemaCXX/lambda-expressions.cpp
index 5fff855102fb1..0f4edc4d1f343 100644
--- a/clang/test/SemaCXX/lambda-expressions.cpp
+++ b/clang/test/SemaCXX/lambda-expressions.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -fblocks %s
+// RUN: %clang_cc1 -std=c++14 -Wno-unused-value -fsyntax-only -verify -verify=expected-cxx14 -fblocks %s
+// RUN: %clang_cc1 -std=c++17 -Wno-unused-value -fsyntax-only -verify -fblocks %s
 
 namespace std { class type_info; };
 
@@ -12,6 +13,7 @@ namespace ExplicitCapture {
     void ImplicitThisCapture() {
       [](){(void)Member;}; // expected-error {{'this' cannot be implicitly captured in this context}}
+      const int var = [](){(void)Member; return 0;}(); // expected-error {{'this' cannot be implicitly captured in this context}}
 
       [&](){(void)Member;};
       [this](){(void)Member;};
@@ -105,7 +107,7 @@ namespace SpecialMembers {
       a = static_cast<decltype(a)&&>(a); // expected-error {{copy assignment operator is implicitly deleted}}
     }
     struct P {
-      P(const P&) = delete; // expected-note 2{{deleted here}}
+      P(const P&) = delete; //expected-note {{deleted here}} // expected-cxx14-note {{deleted here}}
     };
     struct Q {
       ~Q() = delete; // expected-note {{deleted here}}
@@ -118,8 +120,8 @@
   };
   void g(P &p, Q &q, R &r) {
     // FIXME: The note attached to the second error here is just amazingly bad.
-    auto pp = [p]{}; // expected-error {{deleted constructor}} expected-error {{deleted copy constructor of '(lambda}}
-    // expected-note@-1 {{copy constructor of '' is implicitly deleted because field '' has a deleted copy constructor}}
+    auto pp = [p]{}; // expected-error {{deleted constructor}} expected-cxx14-error {{deleted copy constructor of '(lambda}}
+    // expected-cxx14-note@-1 {{copy constructor of '' is implicitly deleted because field '' has a deleted copy constructor}}
    auto qq = [q]{}; // expected-error {{deleted function}} expected-note {{because}}
 
    auto a = [r]{}; // expected-note 2{{here}}
@@ -365,7 +367,7 @@ namespace PR18128 {
     int (*f())[true ? 1 : ([=]{ return n; }(), 0)];
    // expected-error@-1 {{non-local lambda expression cannot have a capture-default}}
    // expected-error@-2 {{invalid use of non-static data member 'n'}}
-   // expected-error@-3 {{a lambda expression may not appear inside of a constant expression}}
+   // expected-cxx14-error@-3 {{a lambda expression may not appear inside of a constant expression}}
    int g(int k = ([=]{ return n; }(), 0));
    // expected-error@-1 {{non-local lambda expression cannot have a capture-default}}
    // expected-error@-2 {{invalid use of non-static data member 'n'}}
@@ -596,8 +598,13 @@ namespace ConversionOperatorDoesNotHaveDeducedReturnType {
     using ExpectedTypeU = void (*)(T&);
 
   struct X {
+#if __cplusplus > 201402L
+    friend constexpr auto T::operator()(int) const;
+    friend constexpr T::operator ExpectedTypeT() const noexcept;
+#else
     friend auto T::operator()(int) const;
     friend T::operator ExpectedTypeT() const;
+#endif
 
     // FIXME: The first of these should match. The second should not.
template diff --git a/clang/test/SemaOpenCL/address-spaces.cl b/clang/test/SemaOpenCL/address-spaces.cl index b39a30372fbfd..a28069470177c 100644 --- a/clang/test/SemaOpenCL/address-spaces.cl +++ b/clang/test/SemaOpenCL/address-spaces.cl @@ -242,18 +242,25 @@ void func_multiple_addr(void) { __private private_int_t *var6;// expected-warning {{multiple identical address spaces specified for type}} } +void func_with_array_param(const unsigned data[16]); + +__kernel void k() { + unsigned data[16]; + func_with_array_param(data); +} + void func_multiple_addr2(void) { typedef __private int private_int_t; - __private __attribute__((ocl_global)) int var1; // expected-error {{multiple address spaces specified for type}} - __private __attribute__((ocl_global)) int *var2; // expected-error {{multiple address spaces specified for type}} - __attribute__((ocl_global)) private_int_t var3; // expected-error {{multiple address spaces specified for type}} - __attribute__((ocl_global)) private_int_t *var4; // expected-error {{multiple address spaces specified for type}} - __attribute__((ocl_private)) private_int_t var5; // expected-warning {{multiple identical address spaces specified for type}} - __attribute__((ocl_private)) private_int_t *var6; // expected-warning {{multiple identical address spaces specified for type}} + __private __attribute__((opencl_global)) int var1; // expected-error {{multiple address spaces specified for type}} + __private __attribute__((opencl_global)) int *var2; // expected-error {{multiple address spaces specified for type}} + __attribute__((opencl_global)) private_int_t var3; // expected-error {{multiple address spaces specified for type}} + __attribute__((opencl_global)) private_int_t *var4; // expected-error {{multiple address spaces specified for type}} + __attribute__((opencl_private)) private_int_t var5; // expected-warning {{multiple identical address spaces specified for type}} + __attribute__((opencl_private)) private_int_t *var6; // expected-warning {{multiple identical address spaces specified for type}} #if __OPENCL_CPP_VERSION__ - [[clang::ocl_private]] __global int var7; // expected-error {{multiple address spaces specified for type}} - [[clang::ocl_private]] __global int *var8; // expected-error {{multiple address spaces specified for type}} - [[clang::ocl_private]] private_int_t var9; // expected-warning {{multiple identical address spaces specified for type}} - [[clang::ocl_private]] private_int_t *var10; // expected-warning {{multiple identical address spaces specified for type}} + [[clang::opencl_private]] __global int var7; // expected-error {{multiple address spaces specified for type}} + [[clang::opencl_private]] __global int *var8; // expected-error {{multiple address spaces specified for type}} + [[clang::opencl_private]] private_int_t var9; // expected-warning {{multiple identical address spaces specified for type}} + [[clang::opencl_private]] private_int_t *var10; // expected-warning {{multiple identical address spaces specified for type}} #endif // !__OPENCL_CPP_VERSION__ } diff --git a/clang/test/SemaOpenCL/event_t.cl b/clang/test/SemaOpenCL/event_t.cl index e7daf88576cc5..ab7f09170e9cf 100644 --- a/clang/test/SemaOpenCL/event_t.cl +++ b/clang/test/SemaOpenCL/event_t.cl @@ -1,6 +1,6 @@ // RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -event_t glb_evt; // expected-error {{the 'event_t' type cannot be used to declare a program scope variable}} +event_t glb_evt; // expected-error {{the 'event_t' type cannot be used to declare a program scope 
variable}} expected-error{{program scope variable must reside in constant address space}} constant struct evt_s { event_t evt; // expected-error {{the 'event_t' type cannot be used to declare a structure or union field}} @@ -10,7 +10,7 @@ void foo(event_t evt); // expected-note {{passing argument to parameter 'evt' he void kernel ker(event_t argevt) { // expected-error {{'event_t' cannot be used as the type of a kernel parameter}} event_t e; - constant event_t const_evt; // expected-error {{the event_t type can only be used with __private address space qualifier}} + constant event_t const_evt; // expected-error {{the event_t type can only be used with __private address space qualifier}} expected-error{{variable in constant address space must be initialized}} foo(e); foo(0); foo(5); // expected-error {{passing 'int' to parameter of incompatible type 'event_t'}} diff --git a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl index 97a01a1fe9311..589d04c64e82d 100644 --- a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl +++ b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl @@ -32,6 +32,7 @@ typedef float float4 __attribute__((ext_vector_type(4))); typedef half half4 __attribute__((ext_vector_type(4))); typedef int int2 __attribute__((ext_vector_type(2))); typedef int int4 __attribute__((ext_vector_type(4))); +typedef uint uint4 __attribute__((ext_vector_type(4))); typedef long long2 __attribute__((ext_vector_type(2))); #endif @@ -67,6 +68,13 @@ char4 test_int(char c, char4 c4) { return max(c4, c); } +kernel void basic_vector_misc(float4 a) { + float4 res; + uint4 mask = (uint4)(1, 2, 3, 4); + + res = shuffle(a, mask); +} + kernel void basic_image_readonly(read_only image2d_t image_read_only_image2d) { int2 i2; sampler_t sampler; diff --git a/clang/test/SemaOpenCL/invalid-block.cl b/clang/test/SemaOpenCL/invalid-block.cl index 5d6dc380a37a1..7cbcea96d0acf 100644 --- a/clang/test/SemaOpenCL/invalid-block.cl +++ b/clang/test/SemaOpenCL/invalid-block.cl @@ -58,11 +58,11 @@ void f5(int i) { : bl2(i); // expected-error {{block type cannot be used as expression in ternary expression in OpenCL}} } // A block pointer type and all pointer operations are disallowed -void f6(bl2_t *bl_ptr) { // expected-error{{pointer to type '__generic bl2_t' (aka 'int (__generic ^const __generic)(int)') is invalid in OpenCL}} +void f6(bl2_t *bl_ptr) { // expected-error{{pointer to type 'bl2_t' (aka 'int (__generic ^const)(int)') is invalid in OpenCL}} bl2_t bl = ^(int i) { return 1; }; - bl2_t *p; // expected-error {{pointer to type '__generic bl2_t' (aka 'int (__generic ^const __generic)(int)') is invalid in OpenCL}} + bl2_t *p; // expected-error {{pointer to type 'bl2_t' (aka 'int (__generic ^const)(int)') is invalid in OpenCL}} *bl; // expected-error {{invalid argument type 'bl2_t' (aka 'int (__generic ^const)(int)') to unary expression}} &bl; // expected-error {{invalid argument type 'bl2_t' (aka 'int (__generic ^const)(int)') to unary expression}} } diff --git a/clang/test/SemaOpenCL/invalid-pipes-cl2.0.cl b/clang/test/SemaOpenCL/invalid-pipes-cl2.0.cl index 69fa2b6da823f..de1b4f8858fa0 100644 --- a/clang/test/SemaOpenCL/invalid-pipes-cl2.0.cl +++ b/clang/test/SemaOpenCL/invalid-pipes-cl2.0.cl @@ -4,7 +4,7 @@ global pipe int gp; // expected-error {{type '__global read_only pipe int' can only be used as a function parameter in OpenCL}} global reserve_id_t rid; // expected-error {{the '__global reserve_id_t' type cannot be used to declare a program scope 
variable}}
 
-extern pipe write_only int get_pipe(); // expected-error-re{{type '__global write_only pipe int ({{(void)?}})' can only be used as a function parameter in OpenCL}}
+extern pipe write_only int get_pipe(); // expected-error-re{{type '__global write_only pipe int ({{(void)?}})' can only be used as a function parameter in OpenCL}} expected-error{{'write_only' attribute only applies to parameters and typedefs}}
 
 kernel void test_invalid_reserved_id(reserve_id_t ID) { // expected-error {{'reserve_id_t' cannot be used as the type of a kernel parameter}}
 }
diff --git a/clang/test/SemaOpenCL/sampler_t.cl b/clang/test/SemaOpenCL/sampler_t.cl
index fe9d997c89607..888e973cc31d8 100644
--- a/clang/test/SemaOpenCL/sampler_t.cl
+++ b/clang/test/SemaOpenCL/sampler_t.cl
@@ -48,6 +48,9 @@ constant struct sampler_s {
 sampler_t bad(void); //expected-error{{declaring function return value of type 'sampler_t' is not allowed}}
 
 sampler_t global_nonconst_smp = 0; // expected-error {{global sampler requires a const or constant address space qualifier}}
+#ifdef CHECK_SAMPLER_VALUE
+// expected-warning@-2{{sampler initializer has invalid Filter Mode bits}}
+#endif
 
 const sampler_t glb_smp10 = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
 
 const constant sampler_t glb_smp11 = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_NORMALIZED_COORDS_TRUE | CLK_FILTER_LINEAR;
@@ -62,7 +65,7 @@ void kernel ker(sampler_t argsmp) {
 }
 
 #if __OPENCL_C_VERSION__ == 200
-void bad(sampler_t*); // expected-error{{pointer to type '__generic sampler_t' is invalid in OpenCL}}
+void bad(sampler_t *); // expected-error{{pointer to type 'sampler_t' is invalid in OpenCL}}
 #else
 void bad(sampler_t*); // expected-error{{pointer to type 'sampler_t' is invalid in OpenCL}}
 #endif
diff --git a/clang/test/SemaOpenCLCXX/address-space-deduction.cl b/clang/test/SemaOpenCLCXX/address-space-deduction.cl
index ac6b2cabbd0cb..9bffeafb1c2db 100644
--- a/clang/test/SemaOpenCLCXX/address-space-deduction.cl
+++ b/clang/test/SemaOpenCLCXX/address-space-deduction.cl
@@ -65,30 +65,42 @@ template <class T>
 x3<T>::x3(const x3<T> &t) {}
 
 template <class T>
-T xxx(T *in) {
+T xxx(T *in1, T in2) {
   // This pointer can't be deduced to generic because addr space
   // will be taken from the template argument.
   //CHECK: `-VarDecl {{.*}} i 'T *' cinit
-  T *i = in;
+  T *i = in1;
   T ii;
+  __private T *ptr = &ii;
+  ptr = &in2;
   return *i;
 }
 
 __kernel void test() {
   int foo[10];
-  xxx(&foo[0]);
+  xxx<__private int>(&foo[0], foo[0]);
+  // FIXME: Template param deduction fails here because
+  // temporaries are not in the __private address space.
+  // It is probably reasonable to put them in __private
+  // considering that stack and function params are
+  // implicitly in __private.
+  // However, if temporaries are left in default addr
+  // space we should at least pretty print the __private
+  // addr space. Otherwise the diagnostic appears to be
+  // confusing.
+ //xxx(&foo[0], foo[0]); } // Addr space for pointer/reference to an array -//CHECK: FunctionDecl {{.*}} t1 'void (const __generic float (&)[2])' +//CHECK: FunctionDecl {{.*}} t1 'void (const float (__generic &)[2])' void t1(const float (&fYZ)[2]); -//CHECK: FunctionDecl {{.*}} t2 'void (const __generic float (*)[2])' +//CHECK: FunctionDecl {{.*}} t2 'void (const float (__generic *)[2])' void t2(const float (*fYZ)[2]); -//CHECK: FunctionDecl {{.*}} t3 'void (__generic float (((*)))[2])' +//CHECK: FunctionDecl {{.*}} t3 'void (float (((__generic *)))[2])' void t3(float(((*fYZ)))[2]); -//CHECK: FunctionDecl {{.*}} t4 'void (__generic float (((*__generic *)))[2])' +//CHECK: FunctionDecl {{.*}} t4 'void (float (((__generic *__generic *)))[2])' void t4(float(((**fYZ)))[2]); -//CHECK: FunctionDecl {{.*}} t5 'void (__generic float (*__generic (*))[2])' +//CHECK: FunctionDecl {{.*}} t5 'void (float (__generic *(__generic *))[2])' void t5(float (*(*fYZ))[2]); __kernel void k() { diff --git a/clang/test/SemaOpenCLCXX/address-space-lambda.cl b/clang/test/SemaOpenCLCXX/address-space-lambda.cl new file mode 100644 index 0000000000000..cf87bfaeede29 --- /dev/null +++ b/clang/test/SemaOpenCLCXX/address-space-lambda.cl @@ -0,0 +1,53 @@ +//RUN: %clang_cc1 %s -cl-std=clc++ -pedantic -ast-dump -verify | FileCheck %s + +//CHECK: CXXMethodDecl {{.*}} constexpr operator() 'int (int) const __generic' +auto glambda = [](auto a) { return a; }; + +__kernel void test() { + int i; +//CHECK: CXXMethodDecl {{.*}} constexpr operator() 'void () const __generic' + auto llambda = [&]() {i++;}; + llambda(); + glambda(1); + // Test lambda with default parameters +//CHECK: CXXMethodDecl {{.*}} constexpr operator() 'void () const __generic' + [&] {i++;} (); + __constant auto err = [&]() {}; //expected-note-re{{candidate function not viable: address space mismatch in 'this' argument ('__constant (lambda at {{.*}})'), parameter type must be 'const __generic (lambda at {{.*}})'}} + err(); //expected-error-re{{no matching function for call to object of type '__constant (lambda at {{.*}})'}} + // FIXME: There is very limited addr space functionality + // we can test when taking lambda type from the object. + // The limitation is due to addr spaces being added to all + // objects in OpenCL. Once we add metaprogramming utility + // for removing address spaces from a type we can enhance + // testing here. 
+ (*(__constant decltype(llambda) *)nullptr)(); //expected-error{{multiple address spaces specified for type}} + (*(decltype(llambda) *)nullptr)(); +} + +__kernel void test_qual() { +//CHECK: |-CXXMethodDecl {{.*}} constexpr operator() 'void () const' + auto priv1 = []() __private {}; + priv1(); +//CHECK: |-CXXMethodDecl {{.*}} constexpr operator() 'void () const __generic' + auto priv2 = []() __generic {}; + priv2(); + auto priv3 = []() __global {}; //expected-note-re{{candidate function not viable: address space mismatch in 'this' argument ('(lambda at {{.*}})'), parameter type must be 'const __global (lambda at {{.*}})'}} //expected-note{{conversion candidate of type 'void (*)()'}} + priv3(); //expected-error{{no matching function for call to object of type}} + + __constant auto const1 = []() __private{}; //expected-note-re{{candidate function not viable: address space mismatch in 'this' argument ('__constant (lambda at {{.*}})'), parameter type must be 'const (lambda at {{.*}}'}} //expected-note{{conversion candidate of type 'void (*)()'}} + const1(); //expected-error{{no matching function for call to object of type '__constant (lambda at}} + __constant auto const2 = []() __generic{}; //expected-note-re{{candidate function not viable: address space mismatch in 'this' argument ('__constant (lambda at {{.*}})'), parameter type must be 'const __generic (lambda at {{.*}}'}} //expected-note{{conversion candidate of type 'void (*)()'}} + const2(); //expected-error{{no matching function for call to object of type '__constant (lambda at}} +//CHECK: |-CXXMethodDecl {{.*}} constexpr operator() 'void () const __constant' + __constant auto const3 = []() __constant{}; + const3(); + + [&] () __global {} (); //expected-error{{no matching function for call to object of type '(lambda at}} expected-note-re{{candidate function not viable: address space mismatch in 'this' argument ('(lambda at {{.*}})'), parameter type must be 'const __global (lambda at {{.*}})'}} + [&] () __private {} (); //expected-error{{no matching function for call to object of type '(lambda at}} expected-note-re{{candidate function not viable: address space mismatch in 'this' argument ('(lambda at {{.*}})'), parameter type must be 'const (lambda at {{.*}})'}} + + [&] __private {} (); //expected-error{{lambda requires '()' before attribute specifier}} expected-error{{expected body of lambda expression}} + + [&] () mutable __private {} (); + [&] () __private mutable {} (); //expected-error{{expected body of lambda expression}} +} + diff --git a/clang/test/SemaOpenCLCXX/addrspace-auto.cl b/clang/test/SemaOpenCLCXX/addrspace-auto.cl new file mode 100644 index 0000000000000..56fd9eb58ddc4 --- /dev/null +++ b/clang/test/SemaOpenCLCXX/addrspace-auto.cl @@ -0,0 +1,35 @@ +//RUN: %clang_cc1 %s -cl-std=clc++ -pedantic -ast-dump -verify | FileCheck %s + +__constant int i = 1; +//CHECK: |-VarDecl {{.*}} ai '__global int':'__global int' +auto ai = i; + +kernel void test() { + int i; + //CHECK: VarDecl {{.*}} ai 'int':'int' + auto ai = i; + + constexpr int c = 1; + //CHECK: VarDecl {{.*}} used cai '__constant int':'__constant int' + __constant auto cai = c; + //CHECK: VarDecl {{.*}} aii 'int':'int' + auto aii = cai; + + //CHECK: VarDecl {{.*}} ref 'int &' + auto &ref = i; + //CHECK: VarDecl {{.*}} ptr 'int *' + auto *ptr = &i; + //CHECK: VarDecl {{.*}} ref_c '__constant int &' + auto &ref_c = cai; + + //CHECK: VarDecl {{.*}} ptrptr 'int *__generic *' + auto **ptrptr = &ptr; + //CHECK: VarDecl {{.*}} refptr 'int *__generic &' + auto *&refptr = ptr; 
+
+  //CHECK: VarDecl {{.*}} invalid gref '__global auto &'
+  __global auto &gref = i; //expected-error{{variable 'gref' with type '__global auto &' has incompatible initializer of type 'int'}}
+  __local int *ptr_l;
+  //CHECK: VarDecl {{.*}} invalid gptr '__global auto *'
+  __global auto *gptr = ptr_l; //expected-error{{variable 'gptr' with type '__global auto *' has incompatible initializer of type '__local int *'}}
+}
diff --git a/clang/test/SemaOpenCLCXX/restricted.cl b/clang/test/SemaOpenCLCXX/restricted.cl
index fc4938df5bf1e..c00c634073fe7 100644
--- a/clang/test/SemaOpenCLCXX/restricted.cl
+++ b/clang/test/SemaOpenCLCXX/restricted.cl
@@ -32,12 +32,14 @@ B *test_dynamic_cast(B *p) {
 __constant _Thread_local int a = 1;
 // expected-error@-1 {{C++ for OpenCL version 1.0 does not support the '_Thread_local' storage class specifier}}
 // expected-warning@-2 {{'_Thread_local' is a C11 extension}}
-
+// expected-error@-3 {{thread-local storage is not supported for the current target}}
 __constant __thread int b = 2;
 // expected-error@-1 {{C++ for OpenCL version 1.0 does not support the '__thread' storage class specifier}}
+// expected-error@-2 {{thread-local storage is not supported for the current target}}
 
 kernel void test_storage_classes() {
   register int x;
   // expected-error@-1 {{C++ for OpenCL version 1.0 does not support the 'register' storage class specifier}}
   thread_local int y;
   // expected-error@-1 {{C++ for OpenCL version 1.0 does not support the 'thread_local' storage class specifier}}
+  // expected-error@-2 {{thread-local storage is not supported for the current target}}
 }
diff --git a/clang/test/SemaSYCL/Inputs/sycl.hpp b/clang/test/SemaSYCL/Inputs/sycl.hpp
index 937e2736498a2..43ea3ebbf2f63 100644
--- a/clang/test/SemaSYCL/Inputs/sycl.hpp
+++ b/clang/test/SemaSYCL/Inputs/sycl.hpp
@@ -57,17 +57,17 @@ struct DeviceValueType;
 template <typename dataT> struct DeviceValueType<dataT, access::address_space::global_space> {
-  using type = __attribute__((ocl_global)) dataT;
+  using type = __attribute__((opencl_global)) dataT;
 };
 
 template <typename dataT> struct DeviceValueType<dataT, access::address_space::constant_space> {
-  using type = __attribute__((ocl_constant)) dataT;
+  using type = __attribute__((opencl_constant)) dataT;
 };
 
 template <typename dataT> struct DeviceValueType<dataT, access::address_space::local_space> {
-  using type = __attribute__((ocl_local)) dataT;
+  using type = __attribute__((opencl_local)) dataT;
 };
 
 template <typename T> void tmpl(T *t){}
 void usages() {
-  __attribute__((ocl_global)) int *GLOB;
-  __attribute__((ocl_private)) int *PRIV;
+  __attribute__((opencl_global)) int *GLOB;
+  __attribute__((opencl_private)) int *PRIV;
   __attribute__((address_space(3))) int *LOC;
   int *NoAS;
diff --git a/clang/test/SemaSYCL/intel-fpga-local.cpp b/clang/test/SemaSYCL/intel-fpga-local.cpp
index b9d515ff2d606..817d0ded3488b 100644
--- a/clang/test/SemaSYCL/intel-fpga-local.cpp
+++ b/clang/test/SemaSYCL/intel-fpga-local.cpp
@@ -494,7 +494,7 @@ void foo1()
 
 //expected-error@+1{{attribute only applies to local non-const variables and non-static data members}}
 [[intelfpga::max_private_copies(8)]]
-__attribute__((ocl_constant)) unsigned int ext_two[64] = { 1, 2, 3 };
+__attribute__((opencl_constant)) unsigned int ext_two[64] = { 1, 2, 3 };
 
 void other2()
 {
diff --git a/clang/test/SemaTemplate/dependent-names.cpp b/clang/test/SemaTemplate/dependent-names.cpp
index 67ef238083f04..a8de159a1d463 100644
--- a/clang/test/SemaTemplate/dependent-names.cpp
+++ b/clang/test/SemaTemplate/dependent-names.cpp
@@ -273,9 +273,6 @@ namespace PR10187 {
       }
       int e[10];
     };
-    void g() {
-      S<int>().f(); // expected-note {{here}}
-    }
   }
 
   namespace A2 {
diff --git a/clang/test/SemaTemplate/enum-argument.cpp b/clang/test/SemaTemplate/enum-argument.cpp
b/clang/test/SemaTemplate/enum-argument.cpp index 7ff4196139901..a79ed8403e9f4 100644 --- a/clang/test/SemaTemplate/enum-argument.cpp +++ b/clang/test/SemaTemplate/enum-argument.cpp @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s -// expected-no-diagnostics enum Enum { val = 1 }; template struct C { @@ -31,7 +30,7 @@ namespace rdar8020920 { unsigned long long bitfield : e0; void f(int j) { - bitfield + j; + bitfield + j; // expected-warning {{expression result unused}} } }; } diff --git a/clang/test/SemaTemplate/member-access-expr.cpp b/clang/test/SemaTemplate/member-access-expr.cpp index 8dba2e68d6562..ef10d72a0ef80 100644 --- a/clang/test/SemaTemplate/member-access-expr.cpp +++ b/clang/test/SemaTemplate/member-access-expr.cpp @@ -156,7 +156,7 @@ namespace test6 { void get(B **ptr) { // It's okay if at some point we figure out how to diagnose this // at instantiation time. - *ptr = field; + *ptr = field; // expected-error {{assigning to 'test6::B *' from incompatible type 'test6::A *}} } }; } diff --git a/clang/test/SemaTemplate/non-integral-switch-cond.cpp b/clang/test/SemaTemplate/non-integral-switch-cond.cpp new file mode 100644 index 0000000000000..23c8e0ef8d4e1 --- /dev/null +++ b/clang/test/SemaTemplate/non-integral-switch-cond.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +struct NOT_AN_INTEGRAL_TYPE {}; + +template +struct foo { + NOT_AN_INTEGRAL_TYPE Bad; + void run() { + switch (Bad) { // expected-error {{statement requires expression of integer type ('NOT_AN_INTEGRAL_TYPE' invalid)}} + case 0: + break; + } + } +}; diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp index d73a88777d0c8..7a58dd5dcaeda 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp @@ -393,3 +393,12 @@ namespace PR42362 { template struct Z::Q {}; Z::Q q; } + +namespace FunctionConversion { + struct a { void c(char *) noexcept; }; + template void g() { + using T = decltype(f); + using T = void (a::*)(char*); // (not 'noexcept') + } + template void g<&a::c>(); +} diff --git a/clang/test/VFS/external-names.c b/clang/test/VFS/external-names.c index 1e12c930c35ed..0500611c3e408 100644 --- a/clang/test/VFS/external-names.c +++ b/clang/test/VFS/external-names.c @@ -1,5 +1,5 @@ -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" -e "s@EXTERNAL_NAMES@true@" %S/Inputs/use-external-names.yaml > %t.external.yaml -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" -e "s@EXTERNAL_NAMES@false@" %S/Inputs/use-external-names.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" -e "s@EXTERNAL_NAMES@true@" %S/Inputs/use-external-names.yaml > %t.external.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" -e "s@EXTERNAL_NAMES@false@" %S/Inputs/use-external-names.yaml > %t.yaml #include "external-names.h" #ifdef REINCLUDE diff --git a/clang/test/VFS/framework-import.m b/clang/test/VFS/framework-import.m index 858f1f57fbd15..cd923c1dbe0fb 100644 --- a/clang/test/VFS/framework-import.m +++ b/clang/test/VFS/framework-import.m @@ -1,4 +1,4 @@ -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -F %t -ivfsoverlay 
%t.yaml -fsyntax-only %s #import diff --git a/clang/test/VFS/implicit-include.c b/clang/test/VFS/implicit-include.c index 654e0a87de0e7..06bff4b962dbc 100644 --- a/clang/test/VFS/implicit-include.c +++ b/clang/test/VFS/implicit-include.c @@ -1,4 +1,4 @@ -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -ivfsoverlay %t.yaml -I %t -include "not_real.h" -fsyntax-only %s void foo() { diff --git a/clang/test/VFS/include-mixed-real-and-virtual.c b/clang/test/VFS/include-mixed-real-and-virtual.c index e4297c5737d95..b46ee9af99905 100644 --- a/clang/test/VFS/include-mixed-real-and-virtual.c +++ b/clang/test/VFS/include-mixed-real-and-virtual.c @@ -1,7 +1,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t // RUN: echo "void baz(void);" > %t/real.h -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -ivfsoverlay %t.yaml -I %t -fsyntax-only %s #include "not_real.h" diff --git a/clang/test/VFS/include-real-from-virtual.c b/clang/test/VFS/include-real-from-virtual.c index 3a41c4ea2c767..7398be735c5fe 100644 --- a/clang/test/VFS/include-real-from-virtual.c +++ b/clang/test/VFS/include-real-from-virtual.c @@ -1,7 +1,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t // RUN: echo "void baz(void);" > %t/real.h -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -ivfsoverlay %t.yaml -I %t -fsyntax-only %s #include "include_real.h" diff --git a/clang/test/VFS/include-virtual-from-real.c b/clang/test/VFS/include-virtual-from-real.c index 0b0d4cd0025a5..b50d5b7292532 100644 --- a/clang/test/VFS/include-virtual-from-real.c +++ b/clang/test/VFS/include-virtual-from-real.c @@ -1,7 +1,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t // RUN: echo '#include "not_real.h"' > %t/include_not_real.h -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -ivfsoverlay %t.yaml -I %t -fsyntax-only %s #include "include_not_real.h" diff --git a/clang/test/VFS/include.c b/clang/test/VFS/include.c index 16a1bca71a720..a55e73a38178f 100644 --- a/clang/test/VFS/include.c +++ b/clang/test/VFS/include.c @@ -1,4 +1,4 @@ -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -I %t -ivfsoverlay %t.yaml -fsyntax-only %s #include "not_real.h" diff --git a/clang/test/VFS/incomplete-umbrella.m b/clang/test/VFS/incomplete-umbrella.m index 5b2a1e0b4e1b1..196313927bc08 100644 --- a/clang/test/VFS/incomplete-umbrella.m +++ b/clang/test/VFS/incomplete-umbrella.m @@ -1,7 +1,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t/Incomplete.framework/Headers // RUN: echo '// IncompleteReal.h' > 
%t/Incomplete.framework/Headers/IncompleteReal.h -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: not %clang_cc1 -Werror -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \ // RUN: -ivfsoverlay %t.yaml -F %t -fsyntax-only %s 2>&1 | FileCheck %s diff --git a/clang/test/VFS/module-import.m b/clang/test/VFS/module-import.m index 336a72d31cfa6..25d37bbf0a77b 100644 --- a/clang/test/VFS/module-import.m +++ b/clang/test/VFS/module-import.m @@ -1,5 +1,5 @@ // RUN: rm -rf %t -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ivfsoverlay %t.yaml -I %t -fsyntax-only %s @import not_real; @@ -16,7 +16,7 @@ void foo() { #endif // Override the module map (vfsoverlay2 on top) -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay2.yaml > %t2.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay2.yaml > %t2.yaml // RUN: %clang_cc1 -Werror -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ivfsoverlay %t.yaml -ivfsoverlay %t2.yaml -I %t -fsyntax-only %s // vfsoverlay2 not present diff --git a/clang/test/VFS/module_missing_vfs.m b/clang/test/VFS/module_missing_vfs.m index 6285ac0649278..3cd8fc2c9eed0 100644 --- a/clang/test/VFS/module_missing_vfs.m +++ b/clang/test/VFS/module_missing_vfs.m @@ -5,7 +5,7 @@ // ERROR: virtual filesystem overlay file '{{.*}}' not found // RUN: find %t/mcp -name "A-*.pcm" | count 1 -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/MissingVFS/vfsoverlay.yaml > %t/vfs.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/MissingVFS/vfsoverlay.yaml > %t/vfs.yaml // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/mcp -I %S/Inputs/MissingVFS %s -fsyntax-only -ivfsoverlay %t/vfs.yaml // RUN: find %t/mcp -name "A-*.pcm" | count 1 diff --git a/clang/test/VFS/real-path-found-first.m b/clang/test/VFS/real-path-found-first.m index 8d7d21bf7832e..0d9a6de589fd7 100644 --- a/clang/test/VFS/real-path-found-first.m +++ b/clang/test/VFS/real-path-found-first.m @@ -7,7 +7,7 @@ // RUN: rm -rf %t %t-cache %t.pch // RUN: mkdir -p %t/SomeFramework.framework/Modules // RUN: cat %S/Inputs/some_frame_module.map > %t/SomeFramework.framework/Modules/module.modulemap -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // Build // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t-cache -F %t \ diff --git a/clang/test/VFS/relative-path.c b/clang/test/VFS/relative-path.c index fc4ae151d87f7..24313affc69d8 100644 --- a/clang/test/VFS/relative-path.c +++ b/clang/test/VFS/relative-path.c @@ -1,6 +1,6 @@ // RUN: mkdir -p %t // RUN: cd %t -// RUN: sed -e "s@INPUT_DIR@%/S/Inputs@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsoverlay.yaml > %t.yaml +// RUN: sed -e "s@INPUT_DIR@%{/S:regex_replacement}/Inputs@g" -e 
"s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsoverlay.yaml > %t.yaml // RUN: %clang_cc1 -Werror -I . -ivfsoverlay %t.yaml -fsyntax-only %s #include "not_real.h" diff --git a/clang/test/VFS/test_nonmodular.c b/clang/test/VFS/test_nonmodular.c index dbc1f622f2032..faec0e3a51623 100644 --- a/clang/test/VFS/test_nonmodular.c +++ b/clang/test/VFS/test_nonmodular.c @@ -3,7 +3,7 @@ // We can't have module.map inside Inputs/Nonmodular. // RUN: cp %S/Inputs/Nonmodular/Nonmodular.modulemap %t/outdir/module.modulemap // -// RUN: sed -e "s@VDIR@%/t/vdir@g" -e "s@IN_DIR@%/S@g" -e "s@OUT_DIR@%/t/outdir@g" %S/Inputs/Nonmodular/nonmodular-headers.yaml > %t/vdir/nonmodular-headers.yaml +// RUN: sed -e "s@VDIR@%{/t:regex_replacement}/vdir@g" -e "s@IN_DIR@%{/S:regex_replacement}@g" -e "s@OUT_DIR@%{/t:regex_replacement}/outdir@g" %S/Inputs/Nonmodular/nonmodular-headers.yaml > %t/vdir/nonmodular-headers.yaml // RUN: %clang_cc1 -fmodule-name=Nonmodular -fmodules -Wnon-modular-include-in-framework-module -verify -fimplicit-module-maps -fmodules-cache-path=%t/cache -ivfsoverlay %t/vdir/nonmodular-headers.yaml -I %S/Inputs -F %t/vdir -fsyntax-only %S/Inputs/Nonmodular/test.c // expected-no-diagnostics diff --git a/clang/test/VFS/umbrella-framework-import-skipnonexist.m b/clang/test/VFS/umbrella-framework-import-skipnonexist.m index 6f536b40a9113..a778e26af162f 100644 --- a/clang/test/VFS/umbrella-framework-import-skipnonexist.m +++ b/clang/test/VFS/umbrella-framework-import-skipnonexist.m @@ -4,7 +4,7 @@ // RUN: mkdir -p %t/vdir %t/outdir %t/cache // RUN: cp -R %S/Inputs/Bar.framework %t/outdir/ // -// RUN: sed -e "s@VDIR@%/t/vdir@g" -e "s@OUT_DIR@%/t/outdir@g" %S/Inputs/bar-headers.yaml > %t/vdir/bar-headers.yaml +// RUN: sed -e "s@VDIR@%{/t:regex_replacement}/vdir@g" -e "s@OUT_DIR@%{/t:regex_replacement}/outdir@g" %S/Inputs/bar-headers.yaml > %t/vdir/bar-headers.yaml // RUN: rm -f %t/outdir/Bar.framework/Headers/B.h // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache -ivfsoverlay %t/vdir/bar-headers.yaml -F %t/vdir -fsyntax-only %s diff --git a/clang/test/VFS/vfsroot-include.c b/clang/test/VFS/vfsroot-include.c index 2f3ff78bd6e95..2564004ea4b1f 100644 --- a/clang/test/VFS/vfsroot-include.c +++ b/clang/test/VFS/vfsroot-include.c @@ -3,7 +3,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t -// RUN: sed -e "s@TEST_DIR@%/S@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsroot.yaml > %t.yaml +// RUN: sed -e "s@TEST_DIR@%{/S:regex_replacement}@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsroot.yaml > %t.yaml // RUN: not %clang_cc1 -Werror -ivfsoverlay %t.yaml -I %S/Inputs -I /direct-vfs-root-files -fsyntax-only /tests/vfsroot-include.c 2>&1 | FileCheck %s // The line above tests that the compiler input file is looked up through VFS. 
diff --git a/clang/test/VFS/vfsroot-module.m b/clang/test/VFS/vfsroot-module.m index 979c5c2819773..3ad3e19d4b37f 100644 --- a/clang/test/VFS/vfsroot-module.m +++ b/clang/test/VFS/vfsroot-module.m @@ -3,7 +3,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t -// RUN: sed -e "s@TEST_DIR@%/S@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsroot.yaml > %t.yaml +// RUN: sed -e "s@TEST_DIR@%{/S:regex_replacement}@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsroot.yaml > %t.yaml // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fdisable-module-hash -fmodules-cache-path=%t/cache -ivfsoverlay %t.yaml -F %S/Inputs -fsyntax-only /tests/vfsroot-module.m // Test that a file missing from the VFS root is not found, even if it is diff --git a/clang/test/VFS/vfsroot-with-overlay.c b/clang/test/VFS/vfsroot-with-overlay.c index 04a275ed15805..4a2c64cb8734b 100644 --- a/clang/test/VFS/vfsroot-with-overlay.c +++ b/clang/test/VFS/vfsroot-with-overlay.c @@ -3,7 +3,7 @@ // RUN: rm -rf %t // RUN: mkdir -p %t -// RUN: sed -e "s@TEST_DIR@%/S@g" -e "s@OUT_DIR@%/t@g" %S/Inputs/vfsroot.yaml > %t.yaml +// RUN: sed -e "s@TEST_DIR@%{/S:regex_replacement}@g" -e "s@OUT_DIR@%{/t:regex_replacement}@g" %S/Inputs/vfsroot.yaml > %t.yaml // RUN: sed -e "s@INPUT_DIR@/indirect-vfs-root-files@g" -e "s@OUT_DIR@/overlay-dir@g" %S/Inputs/vfsoverlay.yaml > %t/vfsoverlay.yaml // RUN: %clang_cc1 -Werror -ivfsoverlay %t.yaml -ivfsoverlay /direct-vfs-root-files/vfsoverlay.yaml -I /overlay-dir -fsyntax-only /tests/vfsroot-with-overlay.c diff --git a/clang/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs b/clang/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs index 7443405efad27..26a0af3b55b50 100644 --- a/clang/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs +++ b/clang/tools/clang-format-vs/ClangFormat/ClangFormatPackage.cs @@ -24,6 +24,7 @@ using System.Runtime.InteropServices; using System.Xml.Linq; using System.Linq; +using System.Text; namespace LLVM.ClangFormat { @@ -292,8 +293,7 @@ private void FormatSelection(OptionPageGrid options) string text = view.TextBuffer.CurrentSnapshot.GetText(); int start = view.Selection.Start.Position.GetContainingLine().Start.Position; int end = view.Selection.End.Position.GetContainingLine().End.Position; - int length = end - start; - + // clang-format doesn't support formatting a range that starts at the end // of the file. 
if (start >= text.Length && text.Length > 0) @@ -301,7 +301,7 @@ private void FormatSelection(OptionPageGrid options) string path = Vsix.GetDocumentParent(view); string filePath = Vsix.GetDocumentPath(view); - RunClangFormatAndApplyReplacements(text, start, length, path, filePath, options, view); + RunClangFormatAndApplyReplacements(text, start, end, path, filePath, options, view); } /// @@ -336,11 +336,11 @@ private void FormatView(IWpfTextView view, OptionPageGrid options) RunClangFormatAndApplyReplacements(text, 0, text.Length, path, filePath, options, view); } - private void RunClangFormatAndApplyReplacements(string text, int offset, int length, string path, string filePath, OptionPageGrid options, IWpfTextView view) + private void RunClangFormatAndApplyReplacements(string text, int start, int end, string path, string filePath, OptionPageGrid options, IWpfTextView view) { try { - string replacements = RunClangFormat(text, offset, length, path, filePath, options); + string replacements = RunClangFormat(text, start, end, path, filePath, options); ApplyClangFormatReplacements(replacements, view); } catch (Exception e) @@ -363,9 +363,9 @@ private void RunClangFormatAndApplyReplacements(string text, int offset, int len /// <summary> /// Runs the given text through clang-format and returns the replacements as XML. /// - /// Formats the text range starting at offset of the given length. + /// Formats the character range [start, end) of the given text. /// </summary> - private static string RunClangFormat(string text, int offset, int length, string path, string filePath, OptionPageGrid options) + private static string RunClangFormat(string text, int start, int end, string path, string filePath, OptionPageGrid options) { string vsixPath = Path.GetDirectoryName( typeof(ClangFormatPackage).Assembly.Location); @@ -373,6 +373,9 @@ private static string RunClangFormat(string text, int offset, int length, string System.Diagnostics.Process process = new System.Diagnostics.Process(); process.StartInfo.UseShellExecute = false; process.StartInfo.FileName = vsixPath + "\\clang-format.exe"; + char[] chars = text.ToCharArray(); + int offset = Encoding.UTF8.GetByteCount(chars, 0, start); + int length = Encoding.UTF8.GetByteCount(chars, 0, end) - offset; // Poor man's escaping - this will not work when quotes are already escaped // in the input (but we don't need more). string style = options.Style.Replace("\"", "\\\""); @@ -413,10 +416,11 @@ private static string RunClangFormat(string text, int offset, int length, string // 2. We write everything to the standard output - this cannot block, as clang-format // reads the full standard input before analyzing it without writing anything to the // standard output. - process.StandardInput.Write(text); + StreamWriter utf8Writer = new StreamWriter(process.StandardInput.BaseStream, new UTF8Encoding(false)); + utf8Writer.Write(text); // 3. We notify clang-format that the input is done - after this point clang-format // will start analyzing the input and eventually write the output. - process.StandardInput.Close(); + utf8Writer.Close(); // 4. We must read clang-format's output before waiting for it to exit; clang-format // will close the channel by exiting.
string output = process.StandardOutput.ReadToEnd(); @@ -440,13 +444,18 @@ private static void ApplyClangFormatReplacements(string replacements, IWpfTextVi if (replacements.Length == 0) return; + string text = view.TextBuffer.CurrentSnapshot.GetText(); + byte[] bytes = Encoding.UTF8.GetBytes(text); + var root = XElement.Parse(replacements); var edit = view.TextBuffer.CreateEdit(); foreach (XElement replacement in root.Descendants("replacement")) { + int offset = int.Parse(replacement.Attribute("offset").Value); + int length = int.Parse(replacement.Attribute("length").Value); var span = new Span( - int.Parse(replacement.Attribute("offset").Value), - int.Parse(replacement.Attribute("length").Value)); + Encoding.UTF8.GetCharCount(bytes, 0, offset), + Encoding.UTF8.GetCharCount(bytes, offset, length)); edit.Replace(span, replacement.Value); } edit.Apply(); diff --git a/clang/tools/clang-format-vs/README.txt b/clang/tools/clang-format-vs/README.txt index 84e0b451f018d..2cac5b9af9e3c 100644 --- a/clang/tools/clang-format-vs/README.txt +++ b/clang/tools/clang-format-vs/README.txt @@ -10,12 +10,12 @@ the following CMake vars: - BUILD_CLANG_FORMAT_VS_PLUGIN=ON -- NUGET_EXE_PATH=path/to/nuget_dir (unless nuget.exe is already available in PATH) +- NUGET_EXE_DIR=path/to/nuget_dir (unless nuget.exe is already available in PATH) example: cd /d C:\code\llvm mkdir build & cd build - cmake -DBUILD_CLANG_FORMAT_VS_PLUGIN=ON -DNUGET_EXE_PATH=C:\nuget .. + cmake -DBUILD_CLANG_FORMAT_VS_PLUGIN=ON -DNUGET_EXE_DIR=C:\nuget .. Once LLVM.sln is generated, build the clang_format_vsix target, which will build ClangFormat.sln, the C# extension application. diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp index 9e4f32da884fe..efafed1063910 100644 --- a/clang/tools/driver/cc1_main.cpp +++ b/clang/tools/driver/cc1_main.cpp @@ -218,7 +218,7 @@ int cc1_main(ArrayRef Argv, const char *Argv0, void *MainAddr) { if (Clang->getFrontendOpts().TimeTrace) { llvm::timeTraceProfilerInitialize( - Clang->getFrontendOpts().TimeTraceGranularity); + Clang->getFrontendOpts().TimeTraceGranularity, Argv0); } // --print-supported-cpus takes priority over the actual compilation. if (Clang->getFrontendOpts().PrintSupportedCPUs) diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 2078e47195226..a8222356db44a 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -3595,6 +3595,7 @@ enum CXErrorCode clang_parseTranslationUnit2( const char *const *command_line_args, int num_command_line_args, struct CXUnsavedFile *unsaved_files, unsigned num_unsaved_files, unsigned options, CXTranslationUnit *out_TU) { + noteBottomOfStack(); SmallVector Args; Args.push_back("clang"); Args.append(command_line_args, command_line_args + num_command_line_args); @@ -3619,6 +3620,7 @@ enum CXErrorCode clang_parseTranslationUnit2FullArgv( CXErrorCode result = CXError_Failure; auto ParseTranslationUnitImpl = [=, &result] { + noteBottomOfStack(); result = clang_parseTranslationUnit_Impl( CIdx, source_filename, command_line_args, num_command_line_args, llvm::makeArrayRef(unsaved_files, num_unsaved_files), options, out_TU); @@ -6622,9 +6624,10 @@ void clang_enableStackTraces(void) { void clang_executeOnThread(void (*fn)(void*), void *user_data, unsigned stack_size) { - llvm::llvm_execute_on_thread( - fn, user_data, - stack_size == 0 ? llvm::None : llvm::Optional(stack_size)); + llvm::llvm_execute_on_thread(fn, user_data, + stack_size == 0 + ? 
clang::DesiredStackSize + : llvm::Optional(stack_size)); } //===----------------------------------------------------------------------===// diff --git a/clang/unittests/AST/SourceLocationTest.cpp b/clang/unittests/AST/SourceLocationTest.cpp index 6b4dddc3850a9..d104497974f18 100644 --- a/clang/unittests/AST/SourceLocationTest.cpp +++ b/clang/unittests/AST/SourceLocationTest.cpp @@ -648,6 +648,112 @@ TEST(FunctionDecl, FunctionDeclWithNoExceptSpecification) { Language::Lang_CXX11)); } +class FunctionDeclParametersRangeVerifier : public RangeVerifier { +protected: + SourceRange getRange(const FunctionDecl &Function) override { + return Function.getParametersSourceRange(); + } +}; + +TEST(FunctionDeclParameters, FunctionDeclOnlyVariadic) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 8); + EXPECT_TRUE(Verifier.match("void f(...);\n", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclVariadic) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 15); + EXPECT_TRUE(Verifier.match("void f(int a, ...);\n", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclMacroVariadic) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(2, 8, 1, 18); + EXPECT_TRUE(Verifier.match("#define VARIADIC ...\n" + "void f(int a, VARIADIC);\n", + functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclMacroParams) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 16, 2, 20); + EXPECT_TRUE(Verifier.match("#define PARAMS int a, int b\n" + "void f(PARAMS, int c);", + functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclSingleParameter) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 12); + EXPECT_TRUE(Verifier.match("void f(int a);\n", functionDecl())); +} + +TEST(FunctionDeclParameters, MemberFunctionDecl) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(2, 8, 2, 12); + EXPECT_TRUE(Verifier.match("class A{\n" + "void f(int a);\n" + "};", + functionDecl())); +} + +TEST(FunctionDeclParameters, MemberFunctionDeclVariadic) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(2, 8, 2, 15); + EXPECT_TRUE(Verifier.match("class A{\n" + "void f(int a, ...);\n" + "};", + functionDecl())); +} + +TEST(FunctionDeclParameters, StaticFunctionDecl) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(2, 15, 2, 19); + EXPECT_TRUE(Verifier.match("class A{\n" + "static void f(int a);\n" + "};", + functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclMultipleParameters) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 28); + EXPECT_TRUE( + Verifier.match("void f(int a, int b, char *c);\n", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclWithDefaultValue) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 16); + EXPECT_TRUE(Verifier.match("void f(int a = 5);\n", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclWithVolatile) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 22); + EXPECT_TRUE(Verifier.match("void f(volatile int *i);", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclWithConstParam) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 19); + EXPECT_TRUE(Verifier.match("void f(const int *i);", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclWithConstVolatileParam) { + 
FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 28); + EXPECT_TRUE(Verifier.match("void f(const volatile int *i);", functionDecl())); +} + +TEST(FunctionDeclParameters, FunctionDeclWithParamAttribute) { + FunctionDeclParametersRangeVerifier Verifier; + Verifier.expectRange(1, 8, 1, 36); + EXPECT_TRUE(Verifier.match("void f(__attribute__((unused)) int a) {}", + functionDecl())); +} + TEST(CXXMethodDecl, CXXMethodDeclWithThrowSpecification) { RangeVerifier Verifier; Verifier.expectRange(2, 1, 2, 16); diff --git a/clang/unittests/Driver/DistroTest.cpp b/clang/unittests/Driver/DistroTest.cpp index d0c86d1c54c9e..391c0baaadf5c 100644 --- a/clang/unittests/Driver/DistroTest.cpp +++ b/clang/unittests/Driver/DistroTest.cpp @@ -44,7 +44,7 @@ TEST(DistroTest, DetectUbuntu) { "SUPPORT_URL=\"http://help.ubuntu.com/\"\n" "BUG_REPORT_URL=\"http://bugs.launchpad.net/ubuntu/\"\n")); - Distro UbuntuTrusty{UbuntuTrustyFileSystem}; + Distro UbuntuTrusty{UbuntuTrustyFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::UbuntuTrusty), UbuntuTrusty); ASSERT_TRUE(UbuntuTrusty.IsUbuntu()); ASSERT_FALSE(UbuntuTrusty.IsRedhat()); @@ -52,6 +52,9 @@ TEST(DistroTest, DetectUbuntu) { ASSERT_FALSE(UbuntuTrusty.IsDebian()); ASSERT_FALSE(UbuntuTrusty.IsGentoo()); + Distro UbuntuTrusty2{UbuntuTrustyFileSystem, llvm::Triple("unknown-pc-windows")}; + ASSERT_EQ(Distro(Distro::UnknownDistro), UbuntuTrusty2); + llvm::vfs::InMemoryFileSystem UbuntuYakketyFileSystem; UbuntuYakketyFileSystem.addFile("/etc/debian_version", 0, llvm::MemoryBuffer::getMemBuffer("stretch/sid\n")); @@ -74,7 +77,7 @@ TEST(DistroTest, DetectUbuntu) { "VERSION_CODENAME=yakkety\n" "UBUNTU_CODENAME=yakkety\n")); - Distro UbuntuYakkety{UbuntuYakketyFileSystem}; + Distro UbuntuYakkety{UbuntuYakketyFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::UbuntuYakkety), UbuntuYakkety); ASSERT_TRUE(UbuntuYakkety.IsUbuntu()); ASSERT_FALSE(UbuntuYakkety.IsRedhat()); @@ -109,7 +112,7 @@ TEST(DistroTest, DetectRedhat) { "REDHAT_SUPPORT_PRODUCT=\"Fedora\"\n" "REDHAT_SUPPORT_PRODUCT_VERSION=25\n" "PRIVACY_POLICY_URL=https://fedoraproject.org/wiki/Legal:PrivacyPolicy\n")); - Distro Fedora25{Fedora25FileSystem}; + Distro Fedora25{Fedora25FileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::Fedora), Fedora25); ASSERT_FALSE(Fedora25.IsUbuntu()); ASSERT_TRUE(Fedora25.IsRedhat()); @@ -146,7 +149,7 @@ TEST(DistroTest, DetectRedhat) { "REDHAT_SUPPORT_PRODUCT=\"centos\"\n" "REDHAT_SUPPORT_PRODUCT_VERSION=\"7\"\n")); - Distro CentOS7{CentOS7FileSystem}; + Distro CentOS7{CentOS7FileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::RHEL7), CentOS7); ASSERT_FALSE(CentOS7.IsUbuntu()); ASSERT_TRUE(CentOS7.IsRedhat()); @@ -174,7 +177,7 @@ TEST(DistroTest, DetectOpenSUSE) { "HOME_URL=\"https://opensuse.org/\"\n" "ID_LIKE=\"suse\"\n")); - Distro OpenSUSELeap421{OpenSUSELeap421FileSystem}; + Distro OpenSUSELeap421{OpenSUSELeap421FileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::OpenSUSE), OpenSUSELeap421); ASSERT_FALSE(OpenSUSELeap421.IsUbuntu()); ASSERT_FALSE(OpenSUSELeap421.IsRedhat()); @@ -200,7 +203,7 @@ TEST(DistroTest, DetectOpenSUSE) { "HOME_URL=\"https://opensuse.org/\"\n" "ID_LIKE=\"suse\"\n")); - Distro OpenSUSE132{OpenSUSE132FileSystem}; + Distro OpenSUSE132{OpenSUSE132FileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::OpenSUSE), OpenSUSE132); ASSERT_FALSE(OpenSUSE132.IsUbuntu()); ASSERT_FALSE(OpenSUSE132.IsRedhat()); @@ -217,7 +220,7 
@@ TEST(DistroTest, DetectOpenSUSE) { llvm::MemoryBuffer::getMemBuffer("LSB_VERSION=\"core-2.0-noarch:core-3.0-noarch:core-2.0-x86_64:core-3.0-x86_64\"\n")); // SLES10 is unsupported and therefore evaluates to unknown - Distro SLES10{SLES10FileSystem}; + Distro SLES10{SLES10FileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::UnknownDistro), SLES10); ASSERT_FALSE(SLES10.IsUbuntu()); ASSERT_FALSE(SLES10.IsRedhat()); @@ -240,7 +243,7 @@ TEST(DistroTest, DetectDebian) { "SUPPORT_URL=\"http://www.debian.org/support\"\n" "BUG_REPORT_URL=\"https://bugs.debian.org/\"\n")); - Distro DebianJessie{DebianJessieFileSystem}; + Distro DebianJessie{DebianJessieFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::DebianJessie), DebianJessie); ASSERT_FALSE(DebianJessie.IsUbuntu()); ASSERT_FALSE(DebianJessie.IsRedhat()); @@ -259,7 +262,7 @@ TEST(DistroTest, DetectDebian) { "SUPPORT_URL=\"http://www.debian.org/support\"\n" "BUG_REPORT_URL=\"https://bugs.debian.org/\"\n")); - Distro DebianStretchSid{DebianStretchSidFileSystem}; + Distro DebianStretchSid{DebianStretchSidFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::DebianStretch), DebianStretchSid); ASSERT_FALSE(DebianStretchSid.IsUbuntu()); ASSERT_FALSE(DebianStretchSid.IsRedhat()); @@ -281,7 +284,7 @@ TEST(DistroTest, DetectExherbo) { "SUPPORT_URL=\"irc://irc.freenode.net/#exherbo\"\n" "BUG_REPORT_URL=\"https://bugs.exherbo.org/\"\n")); - Distro Exherbo{ExherboFileSystem}; + Distro Exherbo{ExherboFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::Exherbo), Exherbo); ASSERT_FALSE(Exherbo.IsUbuntu()); ASSERT_FALSE(Exherbo.IsRedhat()); @@ -303,7 +306,7 @@ TEST(DistroTest, DetectArchLinux) { "SUPPORT_URL=\"https://bbs.archlinux.org/\"\n" "BUG_REPORT_URL=\"https://bugs.archlinux.org/\"\n")); - Distro ArchLinux{ArchLinuxFileSystem}; + Distro ArchLinux{ArchLinuxFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::ArchLinux), ArchLinux); ASSERT_FALSE(ArchLinux.IsUbuntu()); ASSERT_FALSE(ArchLinux.IsRedhat()); @@ -328,7 +331,7 @@ TEST(DistroTest, DetectGentoo) { "SUPPORT_URL=\"https://www.gentoo.org/support/\"\n" "BUG_REPORT_URL=\"https://bugs.gentoo.org/\"\n")); - Distro Gentoo{GentooFileSystem}; + Distro Gentoo{GentooFileSystem, llvm::Triple("unknown-pc-linux")}; ASSERT_EQ(Distro(Distro::Gentoo), Gentoo); ASSERT_FALSE(Gentoo.IsUbuntu()); ASSERT_FALSE(Gentoo.IsRedhat()); @@ -337,4 +340,57 @@ TEST(DistroTest, DetectGentoo) { ASSERT_TRUE(Gentoo.IsGentoo()); } +TEST(DistroTest, DetectWindowsAndCrossCompile) { + + class CountingFileSystem : public llvm::vfs::ProxyFileSystem { + public: + CountingFileSystem() : ProxyFileSystem(llvm::vfs::getRealFileSystem()) {} + + llvm::ErrorOr<llvm::vfs::Status> status(const llvm::Twine &Path) override { + ++Count; + return llvm::vfs::ProxyFileSystem::status(Path); + } + + llvm::ErrorOr<std::unique_ptr<llvm::vfs::File>> + openFileForRead(const llvm::Twine &Path) override { + ++Count; + return llvm::vfs::ProxyFileSystem::openFileForRead(Path); + } + + unsigned Count{}; + }; + + llvm::IntrusiveRefCntPtr<llvm::vfs::FileSystem> RFS = + llvm::vfs::getRealFileSystem(); + llvm::Triple Host(llvm::sys::getProcessTriple()); + + CountingFileSystem CFileSystem; + Distro LinuxDistro{CFileSystem, llvm::Triple("unknown-pc-linux")}; + if (Host.isOSWindows()) { + ASSERT_EQ(Distro(Distro::UnknownDistro), LinuxDistro); + ASSERT_GT(CFileSystem.Count, 0U); + } + + Distro WinDistro{CFileSystem, llvm::Triple("unknown-pc-windows")}; + ASSERT_EQ(Distro(Distro::UnknownDistro), WinDistro); + ASSERT_GT(CFileSystem.Count, 0U); + + //
When running on Windows along with a real file system, ensure that no + // distro is returned if targeting Linux + if (Host.isOSWindows()) { + Distro LinuxRealDistro{*RFS, llvm::Triple("unknown-pc-linux")}; + ASSERT_EQ(Distro(Distro::UnknownDistro), LinuxRealDistro); + } + // When running on Linux, check if the distro is the same as the host when + // targeting Linux + if (Host.isOSLinux()) { + Distro HostDistro{*RFS, Host}; + Distro LinuxRealDistro{*RFS, llvm::Triple("unknown-pc-linux")}; + ASSERT_EQ(HostDistro, LinuxRealDistro); + } + + Distro WinRealDistro{*RFS, llvm::Triple("unknown-pc-windows")}; + ASSERT_EQ(Distro(Distro::UnknownDistro), WinRealDistro); +} + } // end anonymous namespace diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index d89ad44e4577f..069542683c0d9 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -6990,6 +6990,9 @@ TEST_F(FormatTest, UnderstandsUnaryOperators) { verifyFormat("int a = /* confusing comment */ -1;"); // FIXME: The space after 'i' is wrong, but hopefully, this is a rare case. verifyFormat("int a = i /* confusing comment */++;"); + + verifyFormat("co_yield -1;"); + verifyFormat("co_return -1;"); } TEST_F(FormatTest, DoesNotIndentRelativeToUnaryOperators) { @@ -12552,6 +12555,7 @@ TEST_F(FormatTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(SpacesInParentheses); CHECK_PARSE_BOOL(SpacesInSquareBrackets); CHECK_PARSE_BOOL(SpacesInAngles); + CHECK_PARSE_BOOL(SpacesInConditionalStatement); CHECK_PARSE_BOOL(SpaceInEmptyBlock); CHECK_PARSE_BOOL(SpaceInEmptyParentheses); CHECK_PARSE_BOOL(SpacesInContainerLiterals); @@ -14877,6 +14881,22 @@ TEST_F(FormatTest, AmbersandInLamda) { verifyFormat("auto lambda = [&a = a]() { a = 2; };", AlignStyle); } + TEST_F(FormatTest, SpacesInConditionalStatement) { + FormatStyle Spaces = getLLVMStyle(); + Spaces.SpacesInConditionalStatement = true; + verifyFormat("for ( int i = 0; i; i++ )\n continue;", Spaces); + verifyFormat("if ( !a )\n return;", Spaces); + verifyFormat("if ( a )\n return;", Spaces); + verifyFormat("if constexpr ( a )\n return;", Spaces); + verifyFormat("switch ( a )\ncase 1:\n return;", Spaces); + verifyFormat("while ( a )\n return;", Spaces); + verifyFormat("while ( (a && b) )\n return;", Spaces); + verifyFormat("do {\n} while ( 1 != 0 );", Spaces); + // Check that space on the left of "::" is inserted as expected at beginning + // of condition. + verifyFormat("while ( ::func() )\n return;", Spaces); +} + TEST_F(FormatTest, AlternativeOperators) { // Test case for ensuring alternate operators are not // combined with their right most neighbour. 
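The FormatTest hunk above exercises the new SpacesInConditionalStatement style option, which pads the inside of the parentheses of if/for/while/switch conditions. A quick way to try the option end to end, sketched in Python and assuming a clang-format binary built with this patch is on PATH (the inline-YAML -style syntax is standard clang-format; the rest of the snippet is mine):

    import subprocess

    code = "if(x&&y){return;}"
    style = "{BasedOnStyle: LLVM, SpacesInConditionalStatement: true}"
    result = subprocess.run(
        ["clang-format", "-style=" + style, "-assume-filename=test.cpp"],
        input=code, capture_output=True, text=True, check=True)
    # Per the tests above, the condition comes back as "if ( x && y )".
    print(result.stdout)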
diff --git a/clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp b/clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp index ed44cd86b3e42..b5bba30db78da 100644 --- a/clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp +++ b/clang/unittests/Lex/DependencyDirectivesSourceMinimizerTest.cpp @@ -328,12 +328,17 @@ TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIfdef) { SmallVector Out; ASSERT_FALSE(minimizeSourceToDependencyDirectives("#ifdef A\n" + "void skip();\n" "#elif B\n" "#elif C\n" "#else D\n" "#endif\n", Out)); - EXPECT_STREQ("", Out.data()); + EXPECT_STREQ("#ifdef A\n" + "#elif B\n" + "#elif C\n" + "#endif\n", + Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, Pragma) { @@ -507,6 +512,12 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) { for (auto Source : { "#warning \\\n#include \n", "#error \\\n#include \n", + }) { + ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); + EXPECT_STREQ("", Out.data()); + } + + for (auto Source : { "#if MACRO\n#warning '\n#endif\n", "#if MACRO\n#warning \"\n#endif\n", "#if MACRO\n#warning /*\n#endif\n", @@ -515,7 +526,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) { "#if MACRO\n#error /*\n#endif\n", }) { ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); - EXPECT_STREQ("", Out.data()); + EXPECT_STREQ("#if MACRO\n#endif\n", Out.data()); } } @@ -543,7 +554,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, CharacterLiteralPrefixL) { #include )"; ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out)); - EXPECT_STREQ("#include \n", Out.data()); + EXPECT_STREQ("#if DEBUG\n#endif\n#include \n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, CharacterLiteralPrefixU) { diff --git a/clang/unittests/Tooling/CompilationDatabaseTest.cpp b/clang/unittests/Tooling/CompilationDatabaseTest.cpp index 87727fe7c9079..91685c0d0c736 100644 --- a/clang/unittests/Tooling/CompilationDatabaseTest.cpp +++ b/clang/unittests/Tooling/CompilationDatabaseTest.cpp @@ -859,5 +859,35 @@ TEST_F(TargetAndModeTest, TargetAndMode) { "clang++ --driver-mode=g++ bar.cpp -D bar.cpp"); } +class ExpandResponseFilesTest : public MemDBTest { +public: + ExpandResponseFilesTest() : FS(new llvm::vfs::InMemoryFileSystem) {} + +protected: + void addFile(StringRef File, StringRef Content) { + ASSERT_TRUE( + FS->addFile(File, 0, llvm::MemoryBuffer::getMemBufferCopy(Content))); + } + + std::string getCommand(llvm::StringRef F) { + auto Results = expandResponseFiles(std::make_unique(Entries), FS) + ->getCompileCommands(path(F)); + if (Results.empty()) + return "none"; + return llvm::join(Results[0].CommandLine, " "); + } + + llvm::IntrusiveRefCntPtr FS; +}; + +TEST_F(ExpandResponseFilesTest, ExpandResponseFiles) { + addFile(path(StringRef("rsp1.rsp")), "-Dflag"); + + add("foo.cpp", "clang", "@rsp1.rsp"); + add("bar.cpp", "clang", "-Dflag"); + EXPECT_EQ(getCommand("foo.cpp"), "clang foo.cpp -D foo.cpp -Dflag"); + EXPECT_EQ(getCommand("bar.cpp"), "clang bar.cpp -D bar.cpp -Dflag"); +} + } // end namespace tooling } // end namespace clang diff --git a/clang/unittests/Tooling/Syntax/TokensTest.cpp b/clang/unittests/Tooling/Syntax/TokensTest.cpp index 6ffe2c43dd0ff..2c462d49ee410 100644 --- a/clang/unittests/Tooling/Syntax/TokensTest.cpp +++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp @@ -40,6 +40,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Testing/Support/Annotations.h" #include "llvm/Testing/Support/SupportHelpers.h" +#include 
"gmock/gmock.h" #include #include #include @@ -663,6 +664,20 @@ TEST_F(TokenBufferTest, SpelledByExpanded) { ValueIs(SameRange(findSpelled("not_mapped")))); } +TEST_F(TokenBufferTest, ExpandedTokensForRange) { + recordTokens(R"cpp( + #define SIGN(X) X##_washere + A SIGN(B) C SIGN(D) E SIGN(F) G + )cpp"); + + SourceRange R(findExpanded("C").front().location(), + findExpanded("F_washere").front().location()); + // Sanity check: expanded and spelled tokens are stored separately. + EXPECT_THAT(Buffer.expandedTokens(R), + SameRange(findExpanded("C D_washere E F_washere"))); + EXPECT_THAT(Buffer.expandedTokens(SourceRange()), testing::IsEmpty()); +} + TEST_F(TokenBufferTest, ExpansionStartingAt) { // Object-like macro expansions. recordTokens(R"cpp( diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp index c8be48b1361d0..3d30a074ddd82 100644 --- a/clang/unittests/Tooling/Syntax/TreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp @@ -130,7 +130,7 @@ void foo() {} )cpp", R"txt( *: TranslationUnit -|-TopLevelDeclaration +|-SimpleDeclaration | |-int | |-main | |-( @@ -138,7 +138,7 @@ void foo() {} | `-CompoundStatement | |-{ | `-} -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-foo |-( @@ -157,7 +157,7 @@ int main() { )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-int |-main |-( @@ -202,7 +202,7 @@ void test() { )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -224,7 +224,7 @@ void test() { {"void test() { int a = 10; }", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -232,16 +232,18 @@ void test() { `-CompoundStatement |-{ |-DeclarationStatement - | |-int - | |-a - | |-= - | |-10 + | |-SimpleDeclaration + | | |-int + | | |-a + | | |-= + | | `-UnknownExpression + | | `-10 | `-; `-} )txt"}, {"void test() { ; }", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -263,7 +265,7 @@ void test() { )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -299,7 +301,7 @@ void test() { )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -329,7 +331,7 @@ int test() { return 1; } )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-int |-test |-( @@ -352,7 +354,7 @@ void test() { )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -360,18 +362,21 @@ void test() { `-CompoundStatement |-{ |-DeclarationStatement - | |-int - | |-a - | |-[ - | |-3 - | |-] + | |-SimpleDeclaration + | | |-int + | | |-a + | | |-[ + | | |-UnknownExpression + | | | `-3 + | | `-] | `-; |-RangeBasedForStatement | |-for | |-( - | |-int - | |-x - | |-: + | |-SimpleDeclaration + | | |-int + | | |-x + | | `-: | |-UnknownExpression | | `-a | |-) @@ -384,7 +389,7 @@ void test() { // counterpart. {"void main() { foo: return 100; }", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-main |-( @@ -411,7 +416,7 @@ void test() { )cpp", R"txt( *: TranslationUnit -`-TopLevelDeclaration +`-SimpleDeclaration |-void |-test |-( @@ -444,7 +449,70 @@ void test() { | | `-) | `-; `-} -)txt"}}; +)txt"}, + // Multiple declarators group into a single SimpleDeclaration. 
+ {R"cpp( + int *a, b; + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-int + |-* + |-a + |-, + |-b + `-; + )txt"}, + {R"cpp( + typedef int *a, b; + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-typedef + |-int + |-* + |-a + |-, + |-b + `-; + )txt"}, + // Multiple declarators inside a statement. + {R"cpp( +void foo() { + int *a, b; + typedef int *ta, tb; +} + )cpp", + R"txt( +*: TranslationUnit +`-SimpleDeclaration + |-void + |-foo + |-( + |-) + `-CompoundStatement + |-{ + |-DeclarationStatement + | |-SimpleDeclaration + | | |-int + | | |-* + | | |-a + | | |-, + | | `-b + | `-; + |-DeclarationStatement + | |-SimpleDeclaration + | | |-typedef + | | |-int + | | |-* + | | |-ta + | | |-, + | | `-tb + | `-; + `-} + )txt"}}; for (const auto &T : Cases) { auto *Root = buildTree(T.first); diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 1ca3b5a3f2249..422188a5f3dd4 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1208,14 +1208,16 @@ Result::Ptr MveEmitter::getCodeForArg(unsigned ArgNum, const Type *ArgType, Result::Ptr V = std::make_shared(ArgNum, isa(ArgType)); - if (const auto *ST = dyn_cast(ArgType)) { - if (Promote && ST->isInteger() && ST->sizeInBits() < 32) + if (Promote) { + if (const auto *ST = dyn_cast(ArgType)) { + if (ST->isInteger() && ST->sizeInBits() < 32) + V = std::make_shared(getScalarType("u32"), V); + } else if (const auto *PT = dyn_cast(ArgType)) { V = std::make_shared(getScalarType("u32"), V); - } else if (const auto *PT = dyn_cast(ArgType)) { - V = std::make_shared(getScalarType("u32"), V); - V = std::make_shared("arm_mve_pred_i2v", - std::vector{PT}, - std::vector{V}); + V = std::make_shared("arm_mve_pred_i2v", + std::vector{PT}, + std::vector{V}); + } } return V; diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index cdf761b00c61c..a0f3fb2ddc089 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -161,11 +161,11 @@ class Type { Pointer(false), ScalarForMangling(false), NoManglingQ(false), Bitwidth(0), ElementBitwidth(0), NumVectors(0) {} - Type(TypeSpec TS, char CharMod) + Type(TypeSpec TS, StringRef CharMods) : TS(std::move(TS)), Kind(Void), Immediate(false), Constant(false), Pointer(false), ScalarForMangling(false), NoManglingQ(false), Bitwidth(0), ElementBitwidth(0), NumVectors(0) { - applyModifier(CharMod); + applyModifiers(CharMods); } /// Returns a type representing "void". 
@@ -181,13 +181,15 @@ class Type { bool noManglingQ() const { return NoManglingQ; } bool isPointer() const { return Pointer; } + bool isValue() const { return !isVoid() && !isPointer(); } + bool isScalar() const { return isValue() && NumVectors == 0; } + bool isVector() const { return isValue() && NumVectors > 0; } + bool isConstPointer() const { return Constant; } bool isFloating() const { return Kind == Float; } bool isInteger() const { return Kind == SInt || Kind == UInt; } bool isPoly() const { return Kind == Poly; } bool isSigned() const { return Kind == SInt; } bool isImmediate() const { return Immediate; } - bool isScalar() const { return NumVectors == 0; } - bool isVector() const { return NumVectors > 0; } bool isFloat() const { return isFloating() && ElementBitwidth == 32; } bool isDouble() const { return isFloating() && ElementBitwidth == 64; } bool isHalf() const { return isFloating() && ElementBitwidth == 16; } @@ -205,11 +207,11 @@ class Type { // Mutator functions // void makeUnsigned() { - assert(isInteger() && "not a potentially signed type"); + assert(!isVoid() && "not a potentially signed type"); Kind = UInt; } void makeSigned() { - assert(isInteger() && "not a potentially signed type"); + assert(!isVoid() && "not a potentially signed type"); Kind = SInt; } @@ -267,8 +269,8 @@ class Type { /// seen. This is needed by applyModifier as some modifiers /// only take effect if the type size was changed by "Q" or "H". void applyTypespec(bool &Quad); - /// Applies a prototype modifiers to the type. - void applyModifier(char Mod); + /// Applies prototype modifiers to the type. + void applyModifiers(StringRef Mods); }; //===----------------------------------------------------------------------===// @@ -299,8 +301,8 @@ class Intrinsic { /// The Record this intrinsic was created from. Record *R; - /// The unmangled name and prototype. - std::string Name, Proto; + /// The unmangled name. + std::string Name; /// The input and output typespecs. InTS == OutTS except when /// CartesianProductOfTypes is 1 - this is the case for vreinterpret. TypeSpec OutTS, InTS; @@ -323,6 +325,8 @@ class Intrinsic { /// The types of return value [0] and parameters [1..]. std::vector Types; + /// The index of the key type passed to CGBuiltin.cpp for polymorphic calls. + int PolymorphicKeyType; /// The local variables defined. std::map Variables; /// NeededEarly - set if any other intrinsic depends on this intrinsic. @@ -358,34 +362,39 @@ class Intrinsic { Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS, TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter, StringRef Guard, bool IsUnavailable, bool BigEndianSafe) - : R(R), Name(Name.str()), Proto(Proto.str()), OutTS(OutTS), InTS(InTS), - CK(CK), Body(Body), Guard(Guard.str()), IsUnavailable(IsUnavailable), - BigEndianSafe(BigEndianSafe), NeededEarly(false), UseMacro(false), - BaseType(OutTS, 'd'), InBaseType(InTS, 'd'), Emitter(Emitter) { - // If this builtin takes an immediate argument, we need to #define it rather - // than use a standard declaration, so that SemaChecking can range check - // the immediate passed by the user. - if (Proto.find('i') != std::string::npos) - UseMacro = true; - - // Pointer arguments need to use macros to avoid hiding aligned attributes - // from the pointer type. 
- if (Proto.find('p') != std::string::npos || - Proto.find('c') != std::string::npos) - UseMacro = true; - - // It is not permitted to pass or return an __fp16 by value, so intrinsics - // taking a scalar float16_t must be implemented as macros. - if (OutTS.find('h') != std::string::npos && - Proto.find('s') != std::string::npos) - UseMacro = true; - + : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), CK(CK), Body(Body), + Guard(Guard.str()), IsUnavailable(IsUnavailable), + BigEndianSafe(BigEndianSafe), PolymorphicKeyType(0), NeededEarly(false), + UseMacro(false), BaseType(OutTS, "."), InBaseType(InTS, "."), + Emitter(Emitter) { // Modify the TypeSpec per-argument to get a concrete Type, and create // known variables for each. // Types[0] is the return value. - Types.emplace_back(OutTS, Proto[0]); - for (unsigned I = 1; I < Proto.size(); ++I) - Types.emplace_back(InTS, Proto[I]); + unsigned Pos = 0; + Types.emplace_back(OutTS, getNextModifiers(Proto, Pos)); + StringRef Mods = getNextModifiers(Proto, Pos); + while (!Mods.empty()) { + Types.emplace_back(InTS, Mods); + if (Mods.find("!") != StringRef::npos) + PolymorphicKeyType = Types.size() - 1; + + Mods = getNextModifiers(Proto, Pos); + } + + for (auto Type : Types) { + // If this builtin takes an immediate argument, we need to #define it rather + // than use a standard declaration, so that SemaChecking can range check + // the immediate passed by the user. + + // Pointer arguments need to use macros to avoid hiding aligned attributes + // from the pointer type. + + // It is not permitted to pass or return an __fp16 by value, so intrinsics + // taking a scalar float16_t must be implemented as macros. + if (Type.isImmediate() || Type.isPointer() || + (Type.isScalar() && Type.isHalf())) + UseMacro = true; + } } /// Get the Record that this intrinsic is based off. @@ -401,23 +410,24 @@ class Intrinsic { /// Return true if the intrinsic takes an immediate operand. bool hasImmediate() const { - return Proto.find('i') != std::string::npos; + return std::any_of(Types.begin(), Types.end(), + [](const Type &T) { return T.isImmediate(); }); } /// Return the parameter index of the immediate operand. unsigned getImmediateIdx() const { - assert(hasImmediate()); - unsigned Idx = Proto.find('i'); - assert(Idx > 0 && "Can't return an immediate!"); - return Idx - 1; + for (unsigned Idx = 0; Idx < Types.size(); ++Idx) + if (Types[Idx].isImmediate()) + return Idx - 1; + llvm_unreachable("Intrinsic has no immediate"); } - unsigned getNumParams() const { return Proto.size() - 1; } + + unsigned getNumParams() const { return Types.size() - 1; } Type getReturnType() const { return Types[0]; } Type getParamType(unsigned I) const { return Types[I + 1]; } Type getBaseType() const { return BaseType; } - /// Return the raw prototype string. - std::string getProto() const { return Proto; } + Type getPolymorphicKeyType() const { return Types[PolymorphicKeyType]; } /// Return true if the prototype has a scalar argument. bool protoHasScalar() const; @@ -471,6 +481,8 @@ class Intrinsic { void indexBody(); private: + StringRef getNextModifiers(StringRef Proto, unsigned &Pos) const; + std::string mangleName(std::string Name, ClassKind CK) const; void initVariables(); @@ -614,10 +626,14 @@ std::string Type::builtin_str() const { if (isVoid()) return "v"; - if (Pointer) + if (isPointer()) { // All pointers are void pointers. 
- S += "v"; - else if (isInteger()) + S = "v"; + if (isConstPointer()) + S += "C"; + S += "*"; + return S; + } else if (isInteger()) switch (ElementBitwidth) { case 8: S += "c"; break; case 16: S += "s"; break; @@ -634,10 +650,11 @@ std::string Type::builtin_str() const { default: llvm_unreachable("Unhandled case!"); } + // FIXME: NECESSARY??????????????????????????????????????????????????????????????????????? if (isChar() && !isPointer() && isSigned()) // Make chars explicitly signed. S = "S" + S; - else if (!isPointer() && isInteger() && !isSigned()) + else if (isInteger() && !isSigned()) S = "U" + S; // Constant indices are "int", but have the "constant expression" modifier. @@ -646,11 +663,8 @@ std::string Type::builtin_str() const { S = "I" + S; } - if (isScalar()) { - if (Constant) S += "C"; - if (Pointer) S += "*"; + if (isScalar()) return S; - } std::string Ret; for (unsigned I = 0; I < NumVectors; ++I) @@ -812,202 +826,77 @@ void Type::applyTypespec(bool &Quad) { Bitwidth = Quad ? 128 : 64; } -void Type::applyModifier(char Mod) { +void Type::applyModifiers(StringRef Mods) { bool AppliedQuad = false; applyTypespec(AppliedQuad); - switch (Mod) { - case 'v': - Kind = Void; - break; - case 't': - if (isPoly()) + for (char Mod : Mods) { + switch (Mod) { + case '.': + break; + case 'v': + Kind = Void; + break; + case 'S': + Kind = SInt; + break; + case 'U': Kind = UInt; - break; - case 'b': - Kind = UInt; - NumVectors = 0; - Bitwidth = ElementBitwidth; - break; - case '$': - Kind = SInt; - NumVectors = 0; - Bitwidth = ElementBitwidth; - break; - case 'u': - Kind = UInt; - break; - case 'x': - assert(!isPoly() && "'u' can't be used with poly types!"); - Kind = SInt; - break; - case 'o': - Bitwidth = ElementBitwidth = 64; - NumVectors = 0; - Kind = Float; - break; - case 'y': - Bitwidth = ElementBitwidth = 32; - NumVectors = 0; - Kind = Float; - break; - case 'Y': - Bitwidth = ElementBitwidth = 16; - NumVectors = 0; - Kind = Float; - break; - case 'I': - Bitwidth = ElementBitwidth = 32; - NumVectors = 0; - Kind = SInt; - break; - case 'L': - Bitwidth = ElementBitwidth = 64; - NumVectors = 0; - Kind = SInt; - break; - case 'U': - Bitwidth = ElementBitwidth = 32; - NumVectors = 0; - Kind = UInt; - break; - case 'O': - Bitwidth = ElementBitwidth = 64; - NumVectors = 0; - Kind = UInt; - break; - case 'f': - Kind = Float; - ElementBitwidth = 32; - break; - case 'F': - Kind = Float; - ElementBitwidth = 64; - break; - case 'H': - Kind = Float; - ElementBitwidth = 16; - break; - case '0': - Kind = Float; - if (AppliedQuad) - Bitwidth /= 2; - ElementBitwidth = 16; - break; - case '1': - Kind = Float; - if (!AppliedQuad) - Bitwidth *= 2; - ElementBitwidth = 16; - break; - case 'g': - if (AppliedQuad) - Bitwidth /= 2; - break; - case 'j': - if (!AppliedQuad) - Bitwidth *= 2; - break; - case 'w': - ElementBitwidth *= 2; - Bitwidth *= 2; - break; - case 'n': - ElementBitwidth *= 2; - break; - case 'i': - Kind = SInt; - ElementBitwidth = Bitwidth = 32; - NumVectors = 0; - Immediate = true; - break; - case 'l': - Kind = UInt; - ElementBitwidth = Bitwidth = 64; - NumVectors = 0; - Immediate = true; - break; - case 'z': - ElementBitwidth /= 2; - Bitwidth = ElementBitwidth; - NumVectors = 0; - break; - case 'r': - ElementBitwidth *= 2; - Bitwidth = ElementBitwidth; - NumVectors = 0; - break; - case 's': - Bitwidth = ElementBitwidth; - NumVectors = 0; - break; - case 'k': - Bitwidth *= 2; - break; - case 'c': - Constant = true; - LLVM_FALLTHROUGH; - case 'p': - Pointer = true; - Bitwidth = ElementBitwidth; 
- NumVectors = 0; - break; - case 'h': - ElementBitwidth /= 2; - break; - case 'q': - ElementBitwidth /= 2; - Bitwidth *= 2; - break; - case 'e': - ElementBitwidth /= 2; - Kind = UInt; - break; - case 'm': - ElementBitwidth /= 2; - Bitwidth /= 2; - break; - case 'd': - break; - case '2': - NumVectors = 2; - break; - case '3': - NumVectors = 3; - break; - case '4': - NumVectors = 4; - break; - case 'B': - NumVectors = 2; - if (!AppliedQuad) - Bitwidth *= 2; - break; - case 'C': - NumVectors = 3; - if (!AppliedQuad) - Bitwidth *= 2; - break; - case 'D': - NumVectors = 4; - if (!AppliedQuad) - Bitwidth *= 2; - break; - case '7': - if (AppliedQuad) - Bitwidth /= 2; - ElementBitwidth = 8; - break; - case '8': - ElementBitwidth = 8; - break; - case '9': - if (!AppliedQuad) - Bitwidth *= 2; - ElementBitwidth = 8; - break; - default: - llvm_unreachable("Unhandled character!"); + break; + case 'F': + Kind = Float; + break; + case 'P': + Kind = Poly; + break; + case '>': + assert(ElementBitwidth < 128); + ElementBitwidth *= 2; + break; + case '<': + assert(ElementBitwidth > 8); + ElementBitwidth /= 2; + break; + case '1': + NumVectors = 0; + break; + case '2': + NumVectors = 2; + break; + case '3': + NumVectors = 3; + break; + case '4': + NumVectors = 4; + break; + case '*': + Pointer = true; + break; + case 'c': + Constant = true; + break; + case 'Q': + Bitwidth = 128; + break; + case 'q': + Bitwidth = 64; + break; + case 'I': + Kind = SInt; + ElementBitwidth = Bitwidth = 32; + NumVectors = 0; + Immediate = true; + break; + case 'p': + if (isPoly()) + Kind = UInt; + break; + case '!': + // Key type, handled elsewhere. + break; + default: + llvm_unreachable("Unhandled character!"); + } } } @@ -1015,6 +904,19 @@ void Type::applyModifier(char Mod) { // Intrinsic implementation //===----------------------------------------------------------------------===// +StringRef Intrinsic::getNextModifiers(StringRef Proto, unsigned &Pos) const { + if (Proto.size() == Pos) + return StringRef(); + else if (Proto[Pos] != '(') + return Proto.substr(Pos++, 1); + + size_t Start = Pos + 1; + size_t End = Proto.find(')', Start); + assert_with_loc(End != StringRef::npos, "unmatched modifier group paren"); + Pos = End + 1; + return Proto.slice(Start, End); +} + std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const { char typeCode = '\0'; bool printNumber = true; @@ -1053,17 +955,13 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const { return S; } -static bool isFloatingPointProtoModifier(char Mod) { - return Mod == 'F' || Mod == 'f' || Mod == 'H' || Mod == 'Y' || Mod == 'I'; -} - std::string Intrinsic::getBuiltinTypeStr() { ClassKind LocalCK = getClassKind(true); std::string S; Type RetT = getReturnType(); if ((LocalCK == ClassI || LocalCK == ClassW) && RetT.isScalar() && - !RetT.isFloating() && !RetT.isVoid()) + !RetT.isFloating()) RetT.makeInteger(RetT.getElementSizeInBits(), false); // Since the return value must be one type, return a vector type of the @@ -1078,7 +976,7 @@ std::string Intrinsic::getBuiltinTypeStr() { if (!RetT.isScalar() && RetT.isInteger() && !RetT.isSigned()) RetT.makeSigned(); - if (LocalCK == ClassB && !RetT.isVoid() && !RetT.isScalar()) + if (LocalCK == ClassB && RetT.isValue() && !RetT.isScalar()) // Cast to vector of 8-bit elements. RetT.makeInteger(8, true); @@ -1194,7 +1092,7 @@ void Intrinsic::initVariables() { // Modify the TypeSpec per-argument to get a concrete Type, and create // known variables for each. 
- for (unsigned I = 1; I < Proto.size(); ++I) {
+ for (unsigned I = 1; I < Types.size(); ++I) {
 char NameC = '0' + (I - 1);
 std::string Name = "p";
 Name.push_back(NameC);
@@ -1315,7 +1213,7 @@ void Intrinsic::emitShadowedArgs() {
 for (unsigned I = 0; I < getNumParams(); ++I) {
 // Do not create a temporary for an immediate argument.
 // That would defeat the whole point of using a macro!
- if (hasImmediate() && Proto[I+1] == 'i')
+ if (getParamType(I).isImmediate())
 continue;
 // Do not create a temporary for pointer arguments. The input
 // pointer may have an alignment hint.
@@ -1339,13 +1237,9 @@ void Intrinsic::emitShadowedArgs() {
 }

 bool Intrinsic::protoHasScalar() const {
- return (Proto.find('s') != std::string::npos ||
- Proto.find('z') != std::string::npos ||
- Proto.find('r') != std::string::npos ||
- Proto.find('b') != std::string::npos ||
- Proto.find('$') != std::string::npos ||
- Proto.find('y') != std::string::npos ||
- Proto.find('o') != std::string::npos);
+ return std::any_of(Types.begin(), Types.end(), [](const Type &T) {
+ return T.isScalar() && !T.isImmediate();
+ });
 }

 void Intrinsic::emitBodyAsBuiltinCall() {
@@ -1408,13 +1302,7 @@ void Intrinsic::emitBodyAsBuiltinCall() {

 // Extra constant integer to hold type class enum for this function, e.g. s8
 if (getClassKind(true) == ClassB) {
- Type ThisTy = getReturnType();
- if (Proto[0] == 'v' || isFloatingPointProtoModifier(Proto[0]))
- ThisTy = getParamType(0);
- if (ThisTy.isPointer())
- ThisTy = getParamType(1);
-
- S += utostr(ThisTy.getNeonEnum());
+ S += utostr(getPolymorphicKeyType().getNeonEnum());
 } else {
 // Remove extraneous ", ".
 S.pop_back();
@@ -2019,9 +1907,9 @@ void NeonEmitter::createIntrinsic(Record *R,
 std::vector<std::pair<TypeSpec, TypeSpec>> NewTypeSpecs;
 for (auto TS : TypeSpecs) {
 if (CartesianProductOfTypes) {
- Type DefaultT(TS, 'd');
+ Type DefaultT(TS, ".");
 for (auto SrcTS : TypeSpecs) {
- Type DefaultSrcT(SrcTS, 'd');
+ Type DefaultSrcT(SrcTS, ".");
 if (TS == SrcTS ||
 DefaultSrcT.getSizeInBits() != DefaultT.getSizeInBits())
 continue;
@@ -2101,31 +1989,19 @@ void NeonEmitter::genOverloadTypeCheckCode(raw_ostream &OS,
 continue;

 uint64_t Mask = 0ULL;
- Type Ty = Def->getReturnType();
- if (Def->getProto()[0] == 'v' ||
- isFloatingPointProtoModifier(Def->getProto()[0]))
- Ty = Def->getParamType(0);
- if (Ty.isPointer())
- Ty = Def->getParamType(1);
-
- Mask |= 1ULL << Ty.getNeonEnum();
+ Mask |= 1ULL << Def->getPolymorphicKeyType().getNeonEnum();

 // Check if the function has a pointer or const pointer argument.
- std::string Proto = Def->getProto();
 int PtrArgNum = -1;
 bool HasConstPtr = false;
 for (unsigned I = 0; I < Def->getNumParams(); ++I) {
- char ArgType = Proto[I + 1];
- if (ArgType == 'c') {
- HasConstPtr = true;
+ const auto &Type = Def->getParamType(I);
+ if (Type.isPointer()) {
 PtrArgNum = I;
- break;
- }
- if (ArgType == 'p') {
- PtrArgNum = I;
- break;
+ HasConstPtr = Type.isConstPointer();
 }
 }
+ // For sret builtins, adjust the pointer argument index.
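+ // (Builtins returning more than one vector do so through a hidden leading
+ // pointer parameter, which shifts every user-visible pointer argument by one.)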
if (PtrArgNum >= 0 && Def->getReturnType().getNumVectors() > 1) PtrArgNum += 1; @@ -2349,7 +2225,7 @@ void NeonEmitter::run(raw_ostream &OS) { bool InIfdef = false; for (auto &TS : TDTypeVec) { bool IsA64 = false; - Type T(TS, 'd'); + Type T(TS, "."); if (T.isDouble() || (T.isPoly() && T.getElementSizeInBits() == 64)) IsA64 = true; @@ -2382,7 +2258,7 @@ void NeonEmitter::run(raw_ostream &OS) { for (unsigned NumMembers = 2; NumMembers <= 4; ++NumMembers) { for (auto &TS : TDTypeVec) { bool IsA64 = false; - Type T(TS, 'd'); + Type T(TS, "."); if (T.isDouble() || (T.isPoly() && T.getElementSizeInBits() == 64)) IsA64 = true; @@ -2395,8 +2271,8 @@ void NeonEmitter::run(raw_ostream &OS) { InIfdef = true; } - char M = '2' + (NumMembers - 2); - Type VT(TS, M); + const char Mods[] = { static_cast('2' + (NumMembers - 2)), 0}; + Type VT(TS, Mods); OS << "typedef struct " << VT.str() << " {\n"; OS << " " << T.str() << " val"; OS << "[" << NumMembers << "]"; diff --git a/clang/utils/convert_arm_neon.py b/clang/utils/convert_arm_neon.py new file mode 100644 index 0000000000000..c4b3645294573 --- /dev/null +++ b/clang/utils/convert_arm_neon.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +# This script was committed on 20/11/2019 and it would probably make sense to remove +# it after the next release branches. + +# This script is pipe based and converts an arm_neon.td (or arm_fp16.td) file +# using the old single-char type modifiers to an equivalent new-style form where +# each modifier is orthogonal and they can be composed. +# +# It was used to directly generate the .td files on master, so if you have any +# local additions I would suggest implementing any modifiers here, and running +# it over your entire pre-merge .td files rather than trying to resolve any +# conflicts manually. + +import re, sys +MOD_MAP = { + 'v': 'v', + 'x': 'S', + 'u': 'U', + 'd': '.', + 'g': 'q', + 'j': 'Q', + 'w': '>Q', + 'n': '>', + 'h': '<', + 'q': '', + 's': '1', + 'z': '1<', + 'r': '1>', + 'b': '1U', + '$': '1S', + 'k': 'Q', + '2': '2', + '3': '3', + '4': '4', + 'B': '2Q', + 'C': '3Q', + 'D': '4Q', + 'p': '*', + 'c': 'c*', + '7': '< desired: + res += '<' + cur /= 2 + return res + + +def remap_protocol(proto, typespec, name): + key_type = 0 + + # Conversions like to see the integer type so they know signedness. + if 'vcvt' in name and '_f' in name and name != 'vcvt_f32_f64' and name != 'vcvt_f64_f32': + key_type = 1 + default_width = typespec_elt_size(typespec) + inconsistent_width = False + for elt in typespec: + new_width = typespec_elt_size(elt) + if new_width and new_width != default_width: + inconsistent_width = True + + res = '' + for i, c in enumerate(proto): + # void and pointers make for bad discriminators in CGBuiltin.cpp. + if c in 'vcp': + key_type += 1 + + if c in MOD_MAP: + cur_mod = MOD_MAP[c] + elif inconsistent_width: + # Otherwise it's a fixed output width modifier. 
+ sys.stderr.write(f'warning: {name} uses fixed output size but has inconsistent input widths: {proto} {typespec}\n')
+
+ if c == 'Y':
+ # Y: scalar of half float
+ resize = get_resize(default_width, 16)
+ cur_mod = f'1F{resize}'
+ elif c == 'y':
+ # y: scalar of float
+ resize = get_resize(default_width, 32)
+ cur_mod = f'1F{resize}'
+ elif c == 'o':
+ # o: scalar of double
+ resize = get_resize(default_width, 64)
+ cur_mod = f'1F{resize}'
+ elif c == 'I':
+ # I: scalar of 32-bit signed
+ resize = get_resize(default_width, 32)
+ cur_mod = f'1S{resize}'
+ elif c == 'L':
+ # L: scalar of 64-bit signed
+ resize = get_resize(default_width, 64)
+ cur_mod = f'1S{resize}'
+ elif c == 'U':
+ # U: scalar of 32-bit unsigned
+ resize = get_resize(default_width, 32)
+ cur_mod = f'1U{resize}'
+ elif c == 'O':
+ # O: scalar of 64-bit unsigned
+ resize = get_resize(default_width, 64)
+ cur_mod = f'1U{resize}'
+ elif c == 'f':
+ # f: float (int args)
+ resize = get_resize(default_width, 32)
+ cur_mod = f'F{resize}'
+ elif c == 'F':
+ # F: double (int args)
+ resize = get_resize(default_width, 64)
+ cur_mod = f'F{resize}'
+ elif c == 'H':
+ # H: half (int args)
+ resize = get_resize(default_width, 16)
+ cur_mod = f'F{resize}'
+ elif c == '0':
+ # 0: half (int args), ignore 'Q' size modifier.
+ resize = get_resize(default_width, 16)
+ cur_mod = f'Fq{resize}'
+ elif c == '1':
+ # 1: half (int args), force 'Q' size modifier.
+ resize = get_resize(default_width, 16)
+ cur_mod = f'FQ{resize}'
+
+ if len(cur_mod) == 0:
+ raise Exception(f'WTF: {c} in {name}')
+
+ if key_type != 0 and key_type == i:
+ cur_mod += '!'
+
+ if len(cur_mod) == 1:
+ res += cur_mod
+ else:
+ res += '(' + cur_mod + ')'
+
+ return res
+
+def replace_insts(m):
+ start, end = m.span('proto')
+ start -= m.start()
+ end -= m.start()
+ new_proto = remap_protocol(m['proto'], m['kinds'], m['name'])
+ return m.group()[:start] + new_proto + m.group()[end:]
+
+INST = re.compile(r'Inst<"(?P<name>.*?)",\s*"(?P<proto>.*?)",\s*"(?P<kinds>.*?)"')
+
+new_td = INST.sub(replace_insts, sys.stdin.read())
+sys.stdout.write(new_td)
diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
index fd895b767d9e6..3e069eba69b46 100644
--- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h
+++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
@@ -263,6 +263,12 @@ class FuzzedDataProvider {
 // which seems to be a natural choice for other implementations as well.
 // To increase the odds even more, we also call |shrink_to_fit| below.
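 // An empty vector's data() pointer may be null, and memcpy from a null
 // pointer is undefined behavior even for zero-length copies, hence the
 // early return added below.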
 std::vector<T> result(size);
+ if (size == 0) {
+ if (num_bytes_to_consume != 0)
+ abort();
+ return result;
+ }
+
 std::memcpy(result.data(), data_ptr_, num_bytes_to_consume);
 Advance(num_bytes_to_consume);
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 1a63aad0e8f66..feacd21d0865b 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -241,6 +241,13 @@ set(x86_ARCH_SOURCES
 powixf2.c
 )

+if (NOT MSVC)
+ set(x86_ARCH_SOURCES
+ ${x86_ARCH_SOURCES}
+ i386/fp_mode.c
+ )
+endif ()
+
 if (NOT MSVC)
 set(x86_64_SOURCES
 ${GENERIC_TF_SOURCES}
diff --git a/compiler-rt/lib/builtins/i386/fp_mode.c b/compiler-rt/lib/builtins/i386/fp_mode.c
new file mode 100644
index 0000000000000..62ab771222c09
--- /dev/null
+++ b/compiler-rt/lib/builtins/i386/fp_mode.c
@@ -0,0 +1,39 @@
+//===----- lib/i386/fp_mode.c - Floating-point mode utilities ----*- C -*-====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../fp_mode.h"
+
+#define X87_TONEAREST 0x0000
+#define X87_DOWNWARD 0x0400
+#define X87_UPWARD 0x0800
+#define X87_TOWARDZERO 0x0c00
+#define X87_RMODE_MASK (X87_TONEAREST | X87_UPWARD | X87_DOWNWARD | X87_TOWARDZERO)
+
+FE_ROUND_MODE __fe_getround() {
+ // Assume that the rounding mode state for the fpu agrees with the SSE unit.
+ unsigned short cw;
+ __asm__ __volatile__ ("fnstcw %0" : "=m" (cw));
+
+ switch (cw & X87_RMODE_MASK) {
+ case X87_TONEAREST:
+ return FE_TONEAREST;
+ case X87_DOWNWARD:
+ return FE_DOWNWARD;
+ case X87_UPWARD:
+ return FE_UPWARD;
+ case X87_TOWARDZERO:
+ return FE_TOWARDZERO;
+ }
+ return FE_TONEAREST;
+}
+
+int __fe_raise_inexact() {
+ float f = 1.0f, g = 3.0f;
+ __asm__ __volatile__ ("fdivs %1" : "+t" (f) : "m" (g));
+ return 0;
+}
diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp
index 606139f2e1787..5df8c0ac91063 100644
--- a/compiler-rt/lib/hwasan/hwasan_report.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_report.cpp
@@ -371,12 +371,13 @@ static void PrintTagInfoAroundAddr(tag_t *tag_ptr, uptr num_rows,
 InternalScopedString s(GetPageSizeCached() * 8);
 for (tag_t *row = beg_row; row < end_row; row += row_len) {
 s.append("%s", row == center_row_beg ? "=>" : "  ");
+ s.append("%p:", row);
 for (uptr i = 0; i < row_len; i++) {
 s.append("%s", row + i == tag_ptr ? "[" : " ");
 print_tag(s, &row[i]);
 s.append("%s", row + i == tag_ptr ? "]" : " ");
 }
- s.append("%s\n", row == center_row_beg ? "<=" : "  ");
+ s.append("\n");
 }
 Printf("%s", s.data());
}
diff --git a/compiler-rt/lib/profile/InstrProfilingUtil.c b/compiler-rt/lib/profile/InstrProfilingUtil.c
index 13301f341fc5a..bf5a9670fe18c 100644
--- a/compiler-rt/lib/profile/InstrProfilingUtil.c
+++ b/compiler-rt/lib/profile/InstrProfilingUtil.c
@@ -207,8 +207,9 @@ COMPILER_RT_VISIBILITY FILE *lprofOpenFileEx(const char *ProfileName) {
 f = fdopen(fd, "r+b");
#elif defined(_WIN32)
 // FIXME: Use the wide variants to handle Unicode filenames.
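 // The share flags below let concurrent processes open the profile file
 // while this one holds it; mutual exclusion is instead provided by the
 // file lock acquired after opening.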
- HANDLE h = CreateFileA(ProfileName, GENERIC_READ | GENERIC_WRITE, 0, 0, - OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0); + HANDLE h = CreateFileA(ProfileName, GENERIC_READ | GENERIC_WRITE, + FILE_SHARE_READ | FILE_SHARE_WRITE, 0, OPEN_ALWAYS, + FILE_ATTRIBUTE_NORMAL, 0); if (h == INVALID_HANDLE_VALUE) return NULL; @@ -218,6 +219,10 @@ COMPILER_RT_VISIBILITY FILE *lprofOpenFileEx(const char *ProfileName) { return NULL; } + if (lprofLockFd(fd) != 0) + PROF_WARN("Data may be corrupted during profile merging : %s\n", + "Fail to obtain file lock due to system limit."); + f = _fdopen(fd, "r+b"); if (f == 0) { CloseHandle(h); diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.h b/compiler-rt/lib/scudo/standalone/allocator_config.h index 166e19e2b8f28..1d00a5d76d04d 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.h +++ b/compiler-rt/lib/scudo/standalone/allocator_config.h @@ -67,7 +67,7 @@ struct AndroidSvelteConfig { struct FuchsiaConfig { // 1GB Regions typedef SizeClassAllocator64 Primary; - typedef MapAllocator<> Secondary; + typedef MapAllocator<0U> Secondary; template using TSDRegistryT = TSDRegistrySharedT; // Shared, max 8 TSDs. }; diff --git a/compiler-rt/lib/scudo/standalone/atomic_helpers.h b/compiler-rt/lib/scudo/standalone/atomic_helpers.h index 47037d764e252..6c84ba86ed329 100644 --- a/compiler-rt/lib/scudo/standalone/atomic_helpers.h +++ b/compiler-rt/lib/scudo/standalone/atomic_helpers.h @@ -21,12 +21,12 @@ enum memory_order { memory_order_acq_rel = 4, memory_order_seq_cst = 5 }; -COMPILER_CHECK(memory_order_relaxed == __ATOMIC_RELAXED); -COMPILER_CHECK(memory_order_consume == __ATOMIC_CONSUME); -COMPILER_CHECK(memory_order_acquire == __ATOMIC_ACQUIRE); -COMPILER_CHECK(memory_order_release == __ATOMIC_RELEASE); -COMPILER_CHECK(memory_order_acq_rel == __ATOMIC_ACQ_REL); -COMPILER_CHECK(memory_order_seq_cst == __ATOMIC_SEQ_CST); +static_assert(memory_order_relaxed == __ATOMIC_RELAXED, ""); +static_assert(memory_order_consume == __ATOMIC_CONSUME, ""); +static_assert(memory_order_acquire == __ATOMIC_ACQUIRE, ""); +static_assert(memory_order_release == __ATOMIC_RELEASE, ""); +static_assert(memory_order_acq_rel == __ATOMIC_ACQ_REL, ""); +static_assert(memory_order_seq_cst == __ATOMIC_SEQ_CST, ""); struct atomic_u8 { typedef u8 Type; @@ -60,7 +60,7 @@ struct atomic_uptr { }; template -INLINE typename T::Type atomic_load(const volatile T *A, memory_order MO) { +inline typename T::Type atomic_load(const volatile T *A, memory_order MO) { DCHECK(!(reinterpret_cast(A) % sizeof(*A))); typename T::Type V; __atomic_load(&A->ValDoNotUse, &V, MO); @@ -68,29 +68,29 @@ INLINE typename T::Type atomic_load(const volatile T *A, memory_order MO) { } template -INLINE void atomic_store(volatile T *A, typename T::Type V, memory_order MO) { +inline void atomic_store(volatile T *A, typename T::Type V, memory_order MO) { DCHECK(!(reinterpret_cast(A) % sizeof(*A))); __atomic_store(&A->ValDoNotUse, &V, MO); } -INLINE void atomic_thread_fence(memory_order) { __sync_synchronize(); } +inline void atomic_thread_fence(memory_order) { __sync_synchronize(); } template -INLINE typename T::Type atomic_fetch_add(volatile T *A, typename T::Type V, +inline typename T::Type atomic_fetch_add(volatile T *A, typename T::Type V, memory_order MO) { DCHECK(!(reinterpret_cast(A) % sizeof(*A))); return __atomic_fetch_add(&A->ValDoNotUse, V, MO); } template -INLINE typename T::Type atomic_fetch_sub(volatile T *A, typename T::Type V, +inline typename T::Type atomic_fetch_sub(volatile T *A, typename T::Type V, 
memory_order MO) { DCHECK(!(reinterpret_cast(A) % sizeof(*A))); return __atomic_fetch_sub(&A->ValDoNotUse, V, MO); } template -INLINE typename T::Type atomic_exchange(volatile T *A, typename T::Type V, +inline typename T::Type atomic_exchange(volatile T *A, typename T::Type V, memory_order MO) { DCHECK(!(reinterpret_cast(A) % sizeof(*A))); typename T::Type R; @@ -99,7 +99,7 @@ INLINE typename T::Type atomic_exchange(volatile T *A, typename T::Type V, } template -INLINE bool atomic_compare_exchange_strong(volatile T *A, typename T::Type *Cmp, +inline bool atomic_compare_exchange_strong(volatile T *A, typename T::Type *Cmp, typename T::Type Xchg, memory_order MO) { return __atomic_compare_exchange(&A->ValDoNotUse, Cmp, &Xchg, false, MO, @@ -107,7 +107,7 @@ INLINE bool atomic_compare_exchange_strong(volatile T *A, typename T::Type *Cmp, } template -INLINE bool atomic_compare_exchange_weak(volatile T *A, typename T::Type *Cmp, +inline bool atomic_compare_exchange_weak(volatile T *A, typename T::Type *Cmp, typename T::Type Xchg, memory_order MO) { return __atomic_compare_exchange(&A->ValDoNotUse, Cmp, &Xchg, true, MO, @@ -117,17 +117,17 @@ INLINE bool atomic_compare_exchange_weak(volatile T *A, typename T::Type *Cmp, // Clutter-reducing helpers. template -INLINE typename T::Type atomic_load_relaxed(const volatile T *A) { +inline typename T::Type atomic_load_relaxed(const volatile T *A) { return atomic_load(A, memory_order_relaxed); } template -INLINE void atomic_store_relaxed(volatile T *A, typename T::Type V) { +inline void atomic_store_relaxed(volatile T *A, typename T::Type V) { atomic_store(A, V, memory_order_relaxed); } template -INLINE typename T::Type atomic_compare_exchange(volatile T *A, +inline typename T::Type atomic_compare_exchange(volatile T *A, typename T::Type Cmp, typename T::Type Xchg) { atomic_compare_exchange_strong(A, &Cmp, Xchg, memory_order_acquire); diff --git a/compiler-rt/lib/scudo/standalone/checksum.h b/compiler-rt/lib/scudo/standalone/checksum.h index 092342fd6efbd..a63b1b4f064d1 100644 --- a/compiler-rt/lib/scudo/standalone/checksum.h +++ b/compiler-rt/lib/scudo/standalone/checksum.h @@ -37,7 +37,7 @@ enum class Checksum : u8 { // significantly on memory accesses, as well as 1K of CRC32 table, on platforms // that do no support hardware CRC32. The checksum itself is 16-bit, which is at // odds with CRC32, but enough for our needs. -INLINE u16 computeBSDChecksum(u16 Sum, uptr Data) { +inline u16 computeBSDChecksum(u16 Sum, uptr Data) { for (u8 I = 0; I < sizeof(Data); I++) { Sum = static_cast((Sum >> 1) | ((Sum & 1) << 15)); Sum = static_cast(Sum + (Data & 0xff)); diff --git a/compiler-rt/lib/scudo/standalone/chunk.h b/compiler-rt/lib/scudo/standalone/chunk.h index 9ae75823ba778..f4d68b3ac6c4a 100644 --- a/compiler-rt/lib/scudo/standalone/chunk.h +++ b/compiler-rt/lib/scudo/standalone/chunk.h @@ -20,7 +20,7 @@ namespace scudo { extern Checksum HashAlgorithm; -INLINE u16 computeChecksum(u32 Seed, uptr Value, uptr *Array, uptr ArraySize) { +inline u16 computeChecksum(u32 Seed, uptr Value, uptr *Array, uptr ArraySize) { // If the hardware CRC32 feature is defined here, it was enabled everywhere, // as opposed to only for crc32_hw.cpp. 
This means that other hardware // specific instructions were likely emitted at other places, and as a result @@ -71,7 +71,7 @@ struct UnpackedHeader { uptr Checksum : 16; }; typedef atomic_u64 AtomicPackedHeader; -COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader)); +static_assert(sizeof(UnpackedHeader) == sizeof(PackedHeader), ""); // Those constants are required to silence some -Werror=conversion errors when // assigning values to the related bitfield variables. @@ -86,13 +86,12 @@ constexpr uptr getHeaderSize() { return roundUpTo(sizeof(PackedHeader), 1U << SCUDO_MIN_ALIGNMENT_LOG); } -INLINE AtomicPackedHeader *getAtomicHeader(void *Ptr) { +inline AtomicPackedHeader *getAtomicHeader(void *Ptr) { return reinterpret_cast(reinterpret_cast(Ptr) - getHeaderSize()); } -INLINE -const AtomicPackedHeader *getConstAtomicHeader(const void *Ptr) { +inline const AtomicPackedHeader *getConstAtomicHeader(const void *Ptr) { return reinterpret_cast( reinterpret_cast(Ptr) - getHeaderSize()); } @@ -100,7 +99,7 @@ const AtomicPackedHeader *getConstAtomicHeader(const void *Ptr) { // We do not need a cryptographically strong hash for the checksum, but a CRC // type function that can alert us in the event a header is invalid or // corrupted. Ideally slightly better than a simple xor of all fields. -static INLINE u16 computeHeaderChecksum(u32 Cookie, const void *Ptr, +static inline u16 computeHeaderChecksum(u32 Cookie, const void *Ptr, UnpackedHeader *Header) { UnpackedHeader ZeroChecksumHeader = *Header; ZeroChecksumHeader.Checksum = 0; @@ -110,7 +109,7 @@ static INLINE u16 computeHeaderChecksum(u32 Cookie, const void *Ptr, ARRAY_SIZE(HeaderHolder)); } -INLINE void storeHeader(u32 Cookie, void *Ptr, +inline void storeHeader(u32 Cookie, void *Ptr, UnpackedHeader *NewUnpackedHeader) { NewUnpackedHeader->Checksum = computeHeaderChecksum(Cookie, Ptr, NewUnpackedHeader); @@ -118,9 +117,8 @@ INLINE void storeHeader(u32 Cookie, void *Ptr, atomic_store_relaxed(getAtomicHeader(Ptr), NewPackedHeader); } -INLINE -void loadHeader(u32 Cookie, const void *Ptr, - UnpackedHeader *NewUnpackedHeader) { +inline void loadHeader(u32 Cookie, const void *Ptr, + UnpackedHeader *NewUnpackedHeader) { PackedHeader NewPackedHeader = atomic_load_relaxed(getConstAtomicHeader(Ptr)); *NewUnpackedHeader = bit_cast(NewPackedHeader); if (UNLIKELY(NewUnpackedHeader->Checksum != @@ -128,7 +126,7 @@ void loadHeader(u32 Cookie, const void *Ptr, reportHeaderCorruption(const_cast(Ptr)); } -INLINE void compareExchangeHeader(u32 Cookie, void *Ptr, +inline void compareExchangeHeader(u32 Cookie, void *Ptr, UnpackedHeader *NewUnpackedHeader, UnpackedHeader *OldUnpackedHeader) { NewUnpackedHeader->Checksum = @@ -141,8 +139,8 @@ INLINE void compareExchangeHeader(u32 Cookie, void *Ptr, reportHeaderRace(Ptr); } -INLINE -bool isValid(u32 Cookie, const void *Ptr, UnpackedHeader *NewUnpackedHeader) { +inline bool isValid(u32 Cookie, const void *Ptr, + UnpackedHeader *NewUnpackedHeader) { PackedHeader NewPackedHeader = atomic_load_relaxed(getConstAtomicHeader(Ptr)); *NewUnpackedHeader = bit_cast(NewPackedHeader); return NewUnpackedHeader->Checksum == diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 8560c2d3599f3..b355a4746fae3 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -184,7 +184,7 @@ template class Allocator { ((Alignment > MinAlignment) ? 
Alignment : Chunk::getHeaderSize()); // Takes care of extravagantly large sizes as well as integer overflows. - COMPILER_CHECK(MaxAllowedMallocSize < UINTPTR_MAX - MaxAlignment); + static_assert(MaxAllowedMallocSize < UINTPTR_MAX - MaxAlignment, ""); if (UNLIKELY(Size >= MaxAllowedMallocSize)) { if (Options.MayReturnNull) return nullptr; @@ -457,6 +457,18 @@ template class Allocator { Stats.get(S); } + // Returns true if the pointer provided was allocated by the current + // allocator instance, which is compliant with tcmalloc's ownership concept. + // A corrupted chunk will not be reported as owned, which is WAI. + bool isOwned(const void *Ptr) { + initThreadMaybe(); + if (!Ptr || !isAligned(reinterpret_cast(Ptr), MinAlignment)) + return false; + Chunk::UnpackedHeader Header; + return Chunk::isValid(Cookie, Ptr, &Header) && + Header.State == Chunk::State::Allocated; + } + private: using SecondaryT = typename Params::Secondary; typedef typename PrimaryT::SizeClassMap SizeClassMap; @@ -468,6 +480,9 @@ template class Allocator { static const uptr MaxAllowedMallocSize = FIRST_32_SECOND_64(1UL << 31, 1ULL << 40); + static_assert(MinAlignment >= sizeof(Chunk::PackedHeader), + "Minimal alignment must at least cover a chunk header."); + // Constants used by the chunk iteration mechanism. static const u32 BlockMarker = 0x44554353U; static const uptr InvalidChunk = ~static_cast(0); @@ -523,7 +538,7 @@ template class Allocator { reportSanityCheckError("class ID"); } - static INLINE void *getBlockBegin(const void *Ptr, + static inline void *getBlockBegin(const void *Ptr, Chunk::UnpackedHeader *Header) { return reinterpret_cast( reinterpret_cast(Ptr) - Chunk::getHeaderSize() - @@ -531,7 +546,7 @@ template class Allocator { } // Return the size of a chunk as requested during its allocation. 
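 // For Primary-backed chunks the header stores that size directly; for
 // Secondary-backed chunks it stores the unused trailing byte count, so the
 // size is recomputed from the block end.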
- INLINE uptr getSize(const void *Ptr, Chunk::UnpackedHeader *Header) { + inline uptr getSize(const void *Ptr, Chunk::UnpackedHeader *Header) { const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes; if (LIKELY(Header->ClassId)) return SizeOrUnusedBytes; diff --git a/compiler-rt/lib/scudo/standalone/common.h b/compiler-rt/lib/scudo/standalone/common.h index c015d1ca56696..a76eb6bbc1645 100644 --- a/compiler-rt/lib/scudo/standalone/common.h +++ b/compiler-rt/lib/scudo/standalone/common.h @@ -19,22 +19,22 @@ namespace scudo { -template INLINE Dest bit_cast(const Source &S) { - COMPILER_CHECK(sizeof(Dest) == sizeof(Source)); +template inline Dest bit_cast(const Source &S) { + static_assert(sizeof(Dest) == sizeof(Source), ""); Dest D; memcpy(&D, &S, sizeof(D)); return D; } -INLINE constexpr uptr roundUpTo(uptr X, uptr Boundary) { +inline constexpr uptr roundUpTo(uptr X, uptr Boundary) { return (X + Boundary - 1) & ~(Boundary - 1); } -INLINE constexpr uptr roundDownTo(uptr X, uptr Boundary) { +inline constexpr uptr roundDownTo(uptr X, uptr Boundary) { return X & ~(Boundary - 1); } -INLINE constexpr bool isAligned(uptr X, uptr Alignment) { +inline constexpr bool isAligned(uptr X, uptr Alignment) { return (X & (Alignment - 1)) == 0; } @@ -48,14 +48,14 @@ template void Swap(T &A, T &B) { B = Tmp; } -INLINE bool isPowerOfTwo(uptr X) { return (X & (X - 1)) == 0; } +inline bool isPowerOfTwo(uptr X) { return (X & (X - 1)) == 0; } -INLINE uptr getMostSignificantSetBitIndex(uptr X) { +inline uptr getMostSignificantSetBitIndex(uptr X) { DCHECK_NE(X, 0U); return SCUDO_WORDSIZE - 1U - static_cast(__builtin_clzl(X)); } -INLINE uptr roundUpToPowerOfTwo(uptr Size) { +inline uptr roundUpToPowerOfTwo(uptr Size) { DCHECK(Size); if (isPowerOfTwo(Size)) return Size; @@ -65,17 +65,17 @@ INLINE uptr roundUpToPowerOfTwo(uptr Size) { return 1UL << (Up + 1); } -INLINE uptr getLeastSignificantSetBitIndex(uptr X) { +inline uptr getLeastSignificantSetBitIndex(uptr X) { DCHECK_NE(X, 0U); return static_cast(__builtin_ctzl(X)); } -INLINE uptr getLog2(uptr X) { +inline uptr getLog2(uptr X) { DCHECK(isPowerOfTwo(X)); return getLeastSignificantSetBitIndex(X); } -INLINE u32 getRandomU32(u32 *State) { +inline u32 getRandomU32(u32 *State) { // ANSI C linear congruential PRNG (16-bit output). // return (*State = *State * 1103515245 + 12345) >> 16; // XorShift (32-bit output). @@ -85,11 +85,11 @@ INLINE u32 getRandomU32(u32 *State) { return *State; } -INLINE u32 getRandomModN(u32 *State, u32 N) { +inline u32 getRandomModN(u32 *State, u32 N) { return getRandomU32(State) % N; // [0, N) } -template INLINE void shuffle(T *A, u32 N, u32 *RandState) { +template inline void shuffle(T *A, u32 N, u32 *RandState) { if (N <= 1) return; u32 State = *RandState; @@ -100,7 +100,7 @@ template INLINE void shuffle(T *A, u32 N, u32 *RandState) { // Hardware specific inlinable functions. -INLINE void yieldProcessor(u8 Count) { +inline void yieldProcessor(u8 Count) { #if defined(__i386__) || defined(__x86_64__) __asm__ __volatile__("" ::: "memory"); for (u8 I = 0; I < Count; I++) @@ -117,7 +117,7 @@ INLINE void yieldProcessor(u8 Count) { extern uptr PageSizeCached; uptr getPageSizeSlow(); -INLINE uptr getPageSizeCached() { +inline uptr getPageSizeCached() { // Bionic uses a hardcoded value. 
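 // (Always 4096 bytes there, which spares us the syscall on Android.)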
if (SCUDO_ANDROID) return 4096U; diff --git a/compiler-rt/lib/scudo/standalone/flags_parser.cpp b/compiler-rt/lib/scudo/standalone/flags_parser.cpp index 070c08b019384..be39fcd4f8879 100644 --- a/compiler-rt/lib/scudo/standalone/flags_parser.cpp +++ b/compiler-rt/lib/scudo/standalone/flags_parser.cpp @@ -108,7 +108,7 @@ void FlagParser::parseString(const char *S) { Pos = OldPos; } -INLINE bool parseBool(const char *Value, bool *b) { +inline bool parseBool(const char *Value, bool *b) { if (strncmp(Value, "0", 1) == 0 || strncmp(Value, "no", 2) == 0 || strncmp(Value, "false", 5) == 0) { *b = false; diff --git a/compiler-rt/lib/scudo/standalone/fuchsia.cpp b/compiler-rt/lib/scudo/standalone/fuchsia.cpp index 0a9483ae1dd0d..b3d72de158cf9 100644 --- a/compiler-rt/lib/scudo/standalone/fuchsia.cpp +++ b/compiler-rt/lib/scudo/standalone/fuchsia.cpp @@ -29,7 +29,7 @@ void NORETURN die() { __builtin_trap(); } // We zero-initialize the Extra parameter of map(), make sure this is consistent // with ZX_HANDLE_INVALID. -COMPILER_CHECK(ZX_HANDLE_INVALID == 0); +static_assert(ZX_HANDLE_INVALID == 0, ""); static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) { // Only scenario so far. @@ -171,7 +171,7 @@ u64 getMonotonicTime() { return _zx_clock_get_monotonic(); } u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); } bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) { - COMPILER_CHECK(MaxRandomLength <= ZX_CPRNG_DRAW_MAX_LEN); + static_assert(MaxRandomLength <= ZX_CPRNG_DRAW_MAX_LEN, ""); if (UNLIKELY(!Buffer || !Length || Length > MaxRandomLength)) return false; _zx_cprng_draw(Buffer, Length); diff --git a/compiler-rt/lib/scudo/standalone/internal_defs.h b/compiler-rt/lib/scudo/standalone/internal_defs.h index f80c0f621a462..8f6a89ecba737 100644 --- a/compiler-rt/lib/scudo/standalone/internal_defs.h +++ b/compiler-rt/lib/scudo/standalone/internal_defs.h @@ -30,7 +30,6 @@ #define INTERFACE __attribute__((visibility("default"))) #define WEAK __attribute__((weak)) -#define INLINE inline #define ALWAYS_INLINE inline __attribute__((always_inline)) #define ALIAS(X) __attribute__((alias(X))) // Please only use the ALIGNED macro before the type. Using ALIGNED after the @@ -126,8 +125,6 @@ void NORETURN reportCheckFailed(const char *File, int Line, die(); \ } while (0) -#define COMPILER_CHECK(Pred) static_assert(Pred, "") - } // namespace scudo #endif // SCUDO_INTERNAL_DEFS_H_ diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h index a0d8560c3f6c9..945324914d30f 100644 --- a/compiler-rt/lib/scudo/standalone/primary32.h +++ b/compiler-rt/lib/scudo/standalone/primary32.h @@ -42,7 +42,7 @@ template class SizeClassAllocator32 { public: typedef SizeClassMapT SizeClassMap; // Regions should be large enough to hold the largest Block. 
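 // (For instance, a RegionSizeLog of 20 gives 1 MB regions, which must be at
 // least SizeClassMap::MaxSize.)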
- COMPILER_CHECK((1UL << RegionSizeLog) >= SizeClassMap::MaxSize); + static_assert((1UL << RegionSizeLog) >= SizeClassMap::MaxSize, ""); typedef SizeClassAllocator32 ThisT; typedef SizeClassAllocatorLocalCache CacheT; typedef typename CacheT::TransferBatch TransferBatch; @@ -204,7 +204,7 @@ template class SizeClassAllocator32 { uptr AllocatedUser; ReleaseToOsInfo ReleaseInfo; }; - COMPILER_CHECK(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0); + static_assert(sizeof(SizeClassInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); uptr computeRegionId(uptr Mem) { const uptr Id = Mem >> RegionSizeLog; diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 559742d05ad9e..b208ff69bb055 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -215,7 +215,7 @@ template class SizeClassAllocator64 { MapPlatformData Data; ReleaseToOsInfo ReleaseInfo; }; - COMPILER_CHECK(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0); + static_assert(sizeof(RegionInfo) % SCUDO_CACHE_LINE_SIZE == 0, ""); uptr PrimaryBase; RegionInfo *RegionInfoArray; diff --git a/compiler-rt/lib/scudo/standalone/quarantine.h b/compiler-rt/lib/scudo/standalone/quarantine.h index 4b3f368ad9659..2bf7e804ef359 100644 --- a/compiler-rt/lib/scudo/standalone/quarantine.h +++ b/compiler-rt/lib/scudo/standalone/quarantine.h @@ -59,7 +59,7 @@ struct QuarantineBatch { void shuffle(u32 State) { ::scudo::shuffle(Batch, Count, &State); } }; -COMPILER_CHECK(sizeof(QuarantineBatch) <= (1U << 13)); // 8Kb. +static_assert(sizeof(QuarantineBatch) <= (1U << 13), ""); // 8Kb. // Per-thread cache of memory blocks. template class QuarantineCache { diff --git a/compiler-rt/lib/scudo/standalone/report.cpp b/compiler-rt/lib/scudo/standalone/report.cpp index 12d851ff019ad..80cc6eda2af92 100644 --- a/compiler-rt/lib/scudo/standalone/report.cpp +++ b/compiler-rt/lib/scudo/standalone/report.cpp @@ -34,7 +34,7 @@ class ScopedErrorReport { ScopedString Message; }; -INLINE void NORETURN trap() { __builtin_trap(); } +inline void NORETURN trap() { __builtin_trap(); } // This could potentially be called recursively if a CHECK fails in the reports. void NORETURN reportCheckFailed(const char *File, int Line, diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h index f288fc7d7592b..ab68e5a1d38d7 100644 --- a/compiler-rt/lib/scudo/standalone/secondary.h +++ b/compiler-rt/lib/scudo/standalone/secondary.h @@ -50,6 +50,10 @@ static Header *getHeader(const void *Ptr) { template class MapAllocator { public: + // Ensure the freelist is disabled on Fuchsia, since it doesn't support + // releasing Secondary blocks yet. 
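+ // (A MaxFreeListSize of 0 means deallocated blocks are unmapped immediately
+ // instead of being cached for reuse.)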
+ static_assert(!SCUDO_FUCHSIA || MaxFreeListSize == 0U, ""); + void initLinkerInitialized(GlobalStats *S) { Stats.initLinkerInitialized(); if (LIKELY(S)) @@ -205,10 +209,11 @@ void *MapAllocator::allocate(uptr Size, uptr AlignmentHint, template void MapAllocator::deallocate(void *Ptr) { LargeBlock::Header *H = LargeBlock::getHeader(Ptr); + const uptr Block = reinterpret_cast(H); { ScopedLock L(Mutex); InUseBlocks.remove(H); - const uptr CommitSize = H->BlockEnd - reinterpret_cast(H); + const uptr CommitSize = H->BlockEnd - Block; FreedBytes += CommitSize; NumberOfFrees++; Stats.sub(StatAllocated, CommitSize); @@ -225,11 +230,10 @@ void MapAllocator::deallocate(void *Ptr) { if (!Inserted) FreeBlocks.push_back(H); const uptr RoundedAllocationStart = - roundUpTo(reinterpret_cast(H) + LargeBlock::getHeaderSize(), - getPageSizeCached()); + roundUpTo(Block + LargeBlock::getHeaderSize(), getPageSizeCached()); MapPlatformData Data = H->Data; // TODO(kostyak): use release_to_os_interval_ms - releasePagesToOS(H->MapBase, RoundedAllocationStart - H->MapBase, + releasePagesToOS(Block, RoundedAllocationStart - Block, H->BlockEnd - RoundedAllocationStart, &Data); return; } diff --git a/compiler-rt/lib/scudo/standalone/size_class_map.h b/compiler-rt/lib/scudo/standalone/size_class_map.h index 59d6ede57ed27..947526e8aea17 100644 --- a/compiler-rt/lib/scudo/standalone/size_class_map.h +++ b/compiler-rt/lib/scudo/standalone/size_class_map.h @@ -49,7 +49,7 @@ class SizeClassMap { static const uptr MaxSize = 1UL << MaxSizeLog; static const uptr NumClasses = MidClass + ((MaxSizeLog - MidSizeLog) << S) + 1; - COMPILER_CHECK(NumClasses <= 256); + static_assert(NumClasses <= 256, ""); static const uptr LargestClassId = NumClasses - 1; static const uptr BatchClassId = 0; diff --git a/compiler-rt/lib/scudo/standalone/tests/atomic_test.cpp b/compiler-rt/lib/scudo/standalone/tests/atomic_test.cpp index 7e6f1d21f6e9c..103cd24624ba5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/atomic_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/atomic_test.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "scudo/standalone/atomic_helpers.h" -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +#include "atomic_helpers.h" namespace scudo { diff --git a/compiler-rt/lib/scudo/standalone/tests/bytemap_test.cpp b/compiler-rt/lib/scudo/standalone/tests/bytemap_test.cpp index df0646bcd99d0..7db7feb6accdc 100644 --- a/compiler-rt/lib/scudo/standalone/tests/bytemap_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/bytemap_test.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "bytemap.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "bytemap.h" +#include #include template void testMap(T &Map, scudo::uptr Size) { diff --git a/compiler-rt/lib/scudo/standalone/tests/checksum_test.cpp b/compiler-rt/lib/scudo/standalone/tests/checksum_test.cpp index 43bbd47a3c35a..361d33c7e4641 100644 --- a/compiler-rt/lib/scudo/standalone/tests/checksum_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/checksum_test.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "checksum.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "checksum.h" #include diff --git a/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp b/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp index 
57e128ec82666..13da70eff85b8 100644 --- a/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/chunk_test.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "chunk.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "chunk.h" #include diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 9205467998ed1..fec5f864aeb7d 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -6,14 +6,15 @@ // //===----------------------------------------------------------------------===// +#include "tests/scudo_unit_test.h" + #include "allocator_config.h" #include "combined.h" -#include "gtest/gtest.h" - #include #include #include +#include static std::mutex Mutex; static std::condition_variable Cv; @@ -21,17 +22,6 @@ static bool Ready = false; static constexpr scudo::Chunk::Origin Origin = scudo::Chunk::Origin::Malloc; -// This allows us to turn on the Quarantine for specific tests. The Quarantine -// parameters are on the low end, to avoid having to loop excessively in some -// tests. -static bool UseQuarantine = false; -extern "C" const char *__scudo_default_options() { - if (!UseQuarantine) - return ""; - return "quarantine_size_kb=256:thread_local_quarantine_size_kb=128:" - "quarantine_max_chunk_size=1024"; -} - template static void testAllocator() { using AllocatorT = scudo::Allocator; auto Deleter = [](AllocatorT *A) { @@ -42,6 +32,12 @@ template static void testAllocator() { Deleter); Allocator->reset(); + EXPECT_FALSE(Allocator->isOwned(&Mutex)); + EXPECT_FALSE(Allocator->isOwned(&Allocator)); + scudo::u64 StackVariable = 0x42424242U; + EXPECT_FALSE(Allocator->isOwned(&StackVariable)); + EXPECT_EQ(StackVariable, 0x42424242U); + constexpr scudo::uptr MinAlignLog = FIRST_32_SECOND_64(3U, 4U); // This allocates and deallocates a bunch of chunks, with a wide range of @@ -56,6 +52,7 @@ template static void testAllocator() { const scudo::uptr Size = (1U << SizeLog) + Delta; void *P = Allocator->allocate(Size, Origin, Align); EXPECT_NE(P, nullptr); + EXPECT_TRUE(Allocator->isOwned(P)); EXPECT_TRUE(scudo::isAligned(reinterpret_cast(P), Align)); EXPECT_LE(Size, Allocator->getUsableSize(P)); memset(P, 0xaa, Size); @@ -121,7 +118,7 @@ template static void testAllocator() { const scudo::uptr NewSize = DataSize + Delta; void *NewP = Allocator->reallocate(P, NewSize); EXPECT_EQ(NewP, P); - for (scudo::uptr I = 0; I < scudo::Min(DataSize, NewSize); I++) + for (scudo::uptr I = 0; I < DataSize - 32; I++) EXPECT_EQ((reinterpret_cast(NewP))[I], Marker); } Allocator->deallocate(P, Origin); @@ -168,15 +165,15 @@ template static void testAllocator() { } TEST(ScudoCombinedTest, BasicCombined) { - testAllocator(); -#if SCUDO_WORDSIZE == 64U + UseQuarantine = false; + testAllocator(); +#if SCUDO_FUCHSIA testAllocator(); -#endif - // The following configs should work on all platforms. 
+#else + testAllocator(); UseQuarantine = true; testAllocator(); - UseQuarantine = false; - testAllocator(); +#endif } template static void stressAllocator(AllocatorT *A) { @@ -223,20 +220,21 @@ template static void testAllocatorThreaded() { } TEST(ScudoCombinedTest, ThreadedCombined) { - testAllocatorThreaded(); -#if SCUDO_WORDSIZE == 64U + UseQuarantine = false; + testAllocatorThreaded(); +#if SCUDO_FUCHSIA testAllocatorThreaded(); -#endif +#else + testAllocatorThreaded(); UseQuarantine = true; testAllocatorThreaded(); - UseQuarantine = false; - testAllocatorThreaded(); +#endif } struct DeathConfig { // Tiny allocator, its Primary only serves chunks of 1024 bytes. using DeathSizeClassMap = scudo::SizeClassMap<1U, 10U, 10U, 10U, 1U, 10U>; - typedef scudo::SizeClassAllocator32 Primary; + typedef scudo::SizeClassAllocator64 Primary; typedef scudo::MapAllocator<0U> Secondary; template using TSDRegistryT = scudo::TSDRegistrySharedT; }; @@ -258,8 +256,8 @@ TEST(ScudoCombinedTest, DeathCombined) { // Invalid sized deallocation. EXPECT_DEATH(Allocator->deallocate(P, Origin, Size + 8U), ""); - // Misaligned pointer. - void *MisalignedP = + // Misaligned pointer. Potentially unused if EXPECT_DEATH isn't available. + UNUSED void *MisalignedP = reinterpret_cast(reinterpret_cast(P) | 1U); EXPECT_DEATH(Allocator->deallocate(MisalignedP, Origin, Size), ""); EXPECT_DEATH(Allocator->reallocate(MisalignedP, Size * 2U), ""); diff --git a/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp b/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp index 1c07bf13181c2..45918ad4d2ca0 100644 --- a/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// +#include "tests/scudo_unit_test.h" + #include "flags.h" #include "flags_parser.h" -#include "gtest/gtest.h" - #include static const char FlagName[] = "flag_name"; diff --git a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp index 0a0c050c98cd5..8e139916d0588 100644 --- a/compiler-rt/lib/scudo/standalone/tests/list_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/list_test.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "scudo/standalone/list.h" -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +#include "list.h" struct ListItem { ListItem *Next; diff --git a/compiler-rt/lib/scudo/standalone/tests/map_test.cpp b/compiler-rt/lib/scudo/standalone/tests/map_test.cpp index ab5dd8ca5fd6a..7c40b73ff2544 100644 --- a/compiler-rt/lib/scudo/standalone/tests/map_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/map_test.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "common.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "common.h" #include #include @@ -31,11 +31,10 @@ TEST(ScudoMapTest, MapNoAccessUnmap) { TEST(ScudoMapTest, MapUnmap) { const scudo::uptr Size = 4 * scudo::getPageSizeCached(); - scudo::MapPlatformData Data = {}; - void *P = scudo::map(nullptr, Size, MappingName, 0, &Data); + void *P = scudo::map(nullptr, Size, MappingName, 0, nullptr); EXPECT_NE(P, nullptr); memset(P, 0xaa, Size); - scudo::unmap(P, Size, 0, &Data); + scudo::unmap(P, Size, 0, nullptr); EXPECT_DEATH(memset(P, 0xbb, Size), ""); } diff --git a/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp 
b/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp index c75ef8edb3666..ce715a19332f4 100644 --- a/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/mutex_test.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "mutex.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "mutex.h" +#include #include class TestData { diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp index 7da7b25ca67ed..64b625e79bf2d 100644 --- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp @@ -6,15 +6,16 @@ // //===----------------------------------------------------------------------===// +#include "tests/scudo_unit_test.h" + #include "primary32.h" #include "primary64.h" #include "size_class_map.h" -#include "gtest/gtest.h" - #include #include #include +#include // Note that with small enough regions, the SizeClassAllocator64 also works on // 32-bit architectures. It's not something we want to encourage, but we still @@ -53,7 +54,9 @@ template static void testPrimary() { TEST(ScudoPrimaryTest, BasicPrimary) { using SizeClassMap = scudo::DefaultSizeClassMap; +#if !SCUDO_FUCHSIA testPrimary>(); +#endif testPrimary>(); } @@ -78,7 +81,7 @@ TEST(ScudoPrimaryTest, Primary64OOM) { AllocationFailed = true; break; } - for (scudo::uptr J = 0; J < B->getCount(); J++) + for (scudo::u32 J = 0; J < B->getCount(); J++) memset(B->get(J), 'B', Size); Batches.push_back(B); } @@ -136,7 +139,9 @@ template static void testIteratePrimary() { TEST(ScudoPrimaryTest, PrimaryIterate) { using SizeClassMap = scudo::DefaultSizeClassMap; +#if !SCUDO_FUCHSIA testIteratePrimary>(); +#endif testIteratePrimary>(); } @@ -193,7 +198,9 @@ template static void testPrimaryThreaded() { TEST(ScudoPrimaryTest, PrimaryThreaded) { using SizeClassMap = scudo::SvelteSizeClassMap; +#if !SCUDO_FUCHSIA testPrimaryThreaded>(); +#endif testPrimaryThreaded>(); } @@ -221,6 +228,8 @@ template static void testReleaseToOS() { TEST(ScudoPrimaryTest, ReleaseToOS) { using SizeClassMap = scudo::DefaultSizeClassMap; +#if !SCUDO_FUCHSIA testReleaseToOS>(); +#endif testReleaseToOS>(); } diff --git a/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp b/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp index 28baf8feb653f..0422c2ff3736b 100644 --- a/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "quarantine.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "quarantine.h" +#include #include static void *FakePtr = reinterpret_cast(0xFA83FA83); diff --git a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp index 3776768e9a848..22d73d09d53d7 100644 --- a/compiler-rt/lib/scudo/standalone/tests/release_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/release_test.cpp @@ -6,16 +6,17 @@ // //===----------------------------------------------------------------------===// +#include "tests/scudo_unit_test.h" + #include "list.h" #include "release.h" #include "size_class_map.h" -#include "gtest/gtest.h" - #include #include #include +#include TEST(ScudoReleaseTest, PackedCounterArray) { for (scudo::uptr I = 0; I < 
SCUDO_WORDSIZE; I++) { diff --git a/compiler-rt/lib/scudo/standalone/tests/report_test.cpp b/compiler-rt/lib/scudo/standalone/tests/report_test.cpp index c2f377d968491..09f03f1ac896d 100644 --- a/compiler-rt/lib/scudo/standalone/tests/report_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/report_test.cpp @@ -6,11 +6,13 @@ // //===----------------------------------------------------------------------===// -#include "scudo/standalone/report.h" -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +#include "report.h" TEST(ScudoReportTest, Generic) { - void *P = reinterpret_cast(0x42424242U); + // Potentially unused if EXPECT_DEATH isn't defined. + UNUSED void *P = reinterpret_cast(0x42424242U); EXPECT_DEATH(scudo::reportError("TEST123"), "Scudo ERROR.*TEST123"); EXPECT_DEATH(scudo::reportInvalidFlag("ABC", "DEF"), "Scudo ERROR.*ABC.*DEF"); EXPECT_DEATH(scudo::reportHeaderCorruption(P), "Scudo ERROR.*42424242"); diff --git a/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test.h b/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test.h new file mode 100644 index 0000000000000..55d039ef77c37 --- /dev/null +++ b/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test.h @@ -0,0 +1,29 @@ +//===-- scudo_unit_test.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "platform.h" + +#if SCUDO_FUCHSIA +#include +#else +#include "gtest/gtest.h" +#endif + +// If EXPECT_DEATH isn't defined, make it a no-op. +#ifndef EXPECT_DEATH +#define EXPECT_DEATH(X, Y) \ + do { \ + } while (0) +#endif + +// If EXPECT_STREQ isn't defined, define our own simple one. +#ifndef EXPECT_STREQ +#define EXPECT_STREQ(X, Y) EXPECT_EQ(strcmp(X, Y), 0) +#endif + +extern bool UseQuarantine; diff --git a/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp b/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp index 60bd5648eef71..e771924354edf 100644 --- a/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/scudo_unit_test_main.cpp @@ -6,9 +6,25 @@ // //===----------------------------------------------------------------------===// -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +// This allows us to turn on/off a Quarantine for specific tests. The Quarantine +// parameters are on the low end, to avoid having to loop excessively in some +// tests. 
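+// Scudo reads __scudo_default_options() when an allocator is initialized, so
+// tests must flip UseQuarantine before creating their allocator instance.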
+bool UseQuarantine = true; +extern "C" __attribute__((visibility("default"))) const char * +__scudo_default_options() { + if (!UseQuarantine) + return "dealloc_type_mismatch=true"; + return "quarantine_size_kb=256:thread_local_quarantine_size_kb=128:" + "quarantine_max_chunk_size=512:dealloc_type_mismatch=true"; +} int main(int argc, char **argv) { +#if !SCUDO_FUCHSIA testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); +#else + return RUN_ALL_TESTS(argc, argv); +#endif } diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp index 047a61653cb2b..1e7dcec5861fe 100644 --- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "secondary.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "secondary.h" #include @@ -16,6 +16,7 @@ #include #include #include +#include template static void testSecondaryBasic(void) { scudo::GlobalStats S; @@ -54,12 +55,18 @@ template static void testSecondaryBasic(void) { } TEST(ScudoSecondaryTest, SecondaryBasic) { - testSecondaryBasic>(); testSecondaryBasic>(); +#if !SCUDO_FUCHSIA + testSecondaryBasic>(); testSecondaryBasic>(); +#endif } +#if SCUDO_FUCHSIA +using LargeAllocator = scudo::MapAllocator<0U>; +#else using LargeAllocator = scudo::MapAllocator<>; +#endif // This exercises a variety of combinations of size and alignment for the // MapAllocator. The size computation done here mimic the ones done by the diff --git a/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp b/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp index 39babc14902e4..55850400a7650 100644 --- a/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "scudo/standalone/size_class_map.h" -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +#include "size_class_map.h" template void testSizeClassMap() { typedef SizeClassMap SCMap; diff --git a/compiler-rt/lib/scudo/standalone/tests/stats_test.cpp b/compiler-rt/lib/scudo/standalone/tests/stats_test.cpp index 449c1491d5558..cdadfbad3cbc2 100644 --- a/compiler-rt/lib/scudo/standalone/tests/stats_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/stats_test.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "scudo/standalone/stats.h" -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +#include "stats.h" TEST(ScudoStatsTest, LocalStats) { scudo::LocalStats LStats; diff --git a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp index 3b1a5e8743e60..eed174dc586a4 100644 --- a/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/strings_test.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "scudo/standalone/string_utils.h" -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" + +#include "string_utils.h" #include diff --git a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp index 1941723d5d04f..b32c62fe6ca16 100644 --- 
a/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/tsd_test.cpp @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// +#include "tests/scudo_unit_test.h" + #include "tsd_exclusive.h" #include "tsd_shared.h" -#include "gtest/gtest.h" - #include #include #include @@ -108,7 +108,9 @@ template static void testRegistry() { TEST(ScudoTSDTest, TSDRegistryBasic) { testRegistry>(); testRegistry>(); +#if !SCUDO_FUCHSIA testRegistry>(); +#endif } static std::mutex Mutex; @@ -164,5 +166,7 @@ template static void testRegistryThreaded() { TEST(ScudoTSDTest, TSDRegistryThreaded) { testRegistryThreaded>(); testRegistryThreaded>(); +#if !SCUDO_FUCHSIA testRegistryThreaded>(); +#endif } diff --git a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp index 946a44eee8e50..d2c6a9b6bb3cc 100644 --- a/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/vector_test.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "vector.h" +#include "tests/scudo_unit_test.h" -#include "gtest/gtest.h" +#include "vector.h" TEST(ScudoVectorTest, Basic) { scudo::Vector V; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index cb651f265f027..99e7aa2fa21cd 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -6,10 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "platform.h" - -#include "gtest/gtest.h" +#include "tests/scudo_unit_test.h" +#include #include #include #include @@ -32,11 +31,6 @@ int malloc_iterate(uintptr_t base, size_t size, // We have to use a small quarantine to make sure that our double-free tests // trigger. Otherwise EXPECT_DEATH ends up reallocating the chunk that was just // freed (this depends on the size obviously) and the following free succeeds. 
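 // (quarantine_max_chunk_size caps which sizes are quarantined: the small
 // test allocations fall under it, while larger ones bypass the quarantine.)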
-extern "C" __attribute__((visibility("default"))) const char * -__scudo_default_options() { - return "quarantine_size_kb=256:thread_local_quarantine_size_kb=128:" - "quarantine_max_chunk_size=512"; -} static const size_t Size = 100U; @@ -200,6 +194,7 @@ TEST(ScudoWrappersCTest, Realloc) { #define M_PURGE -101 #endif +#if !SCUDO_FUCHSIA TEST(ScudoWrappersCTest, MallOpt) { errno = 0; EXPECT_EQ(mallopt(-1000, 1), 0); @@ -213,8 +208,10 @@ TEST(ScudoWrappersCTest, MallOpt) { EXPECT_EQ(mallopt(M_DECAY_TIME, 1), 1); EXPECT_EQ(mallopt(M_DECAY_TIME, 0), 1); } +#endif TEST(ScudoWrappersCTest, OtherAlloc) { +#if !SCUDO_FUCHSIA const size_t PageSize = sysconf(_SC_PAGESIZE); void *P = pvalloc(Size); @@ -229,10 +226,12 @@ TEST(ScudoWrappersCTest, OtherAlloc) { EXPECT_NE(P, nullptr); EXPECT_EQ(reinterpret_cast(P) & (PageSize - 1), 0U); free(P); +#endif EXPECT_EQ(valloc(SIZE_MAX), nullptr); } +#if !SCUDO_FUCHSIA TEST(ScudoWrappersCTest, MallInfo) { const size_t BypassQuarantineSize = 1024U; @@ -248,6 +247,7 @@ TEST(ScudoWrappersCTest, MallInfo) { MI = mallinfo(); EXPECT_GE(static_cast(MI.fordblks), Free + BypassQuarantineSize); } +#endif static uintptr_t BoundaryP; static size_t Count; @@ -282,6 +282,7 @@ TEST(ScudoWrappersCTest, MallocIterateBoundary) { free(P); } +#if !SCUDO_FUCHSIA TEST(ScudoWrappersCTest, MallocInfo) { char Buffer[64]; FILE *F = fmemopen(Buffer, sizeof(Buffer), "w+"); @@ -292,3 +293,4 @@ TEST(ScudoWrappersCTest, MallocInfo) { fclose(F); EXPECT_EQ(strncmp(Buffer, " #include #include +#include void operator delete(void *, size_t) noexcept; void operator delete[](void *, size_t) noexcept; @@ -18,12 +19,6 @@ void operator delete[](void *, size_t) noexcept; // Note that every Cxx allocation function in the test binary will be fulfilled // by Scudo. See the comment in the C counterpart of this file. 
-extern "C" __attribute__((visibility("default"))) const char * -__scudo_default_options() { - return "quarantine_size_kb=256:thread_local_quarantine_size_kb=128:" - "quarantine_max_chunk_size=512:dealloc_type_mismatch=true"; -} - template static void testCxxNew() { T *P = new T; EXPECT_NE(P, nullptr); diff --git a/compiler-rt/lib/scudo/standalone/tsd.h b/compiler-rt/lib/scudo/standalone/tsd.h index f24ff01960fb2..626cc4b80fb7b 100644 --- a/compiler-rt/lib/scudo/standalone/tsd.h +++ b/compiler-rt/lib/scudo/standalone/tsd.h @@ -38,7 +38,7 @@ template struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD { void commitBack(Allocator *Instance) { Instance->commitBack(this); } - INLINE bool tryLock() { + inline bool tryLock() { if (Mutex.tryLock()) { atomic_store_relaxed(&Precedence, 0); return true; @@ -49,12 +49,12 @@ template struct ALIGNED(SCUDO_CACHE_LINE_SIZE) TSD { static_cast(getMonotonicTime() >> FIRST_32_SECOND_64(16, 0))); return false; } - INLINE void lock() { + inline void lock() { atomic_store_relaxed(&Precedence, 0); Mutex.lock(); } - INLINE void unlock() { Mutex.unlock(); } - INLINE uptr getPrecedence() { return atomic_load_relaxed(&Precedence); } + inline void unlock() { Mutex.unlock(); } + inline uptr getPrecedence() { return atomic_load_relaxed(&Precedence); } private: HybridMutex Mutex; diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index a43cf3fc33769..5f58068edf781 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -50,6 +50,7 @@ template struct TSDRegistrySharedT { void unmapTestOnly() { unmap(reinterpret_cast(TSDs), sizeof(TSD) * NumberOfTSDs); + setCurrentTSD(nullptr); } ALWAYS_INLINE void initThreadMaybe(Allocator *Instance, diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h b/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h index d4370d506e5ea..7fc1a9600e53b 100644 --- a/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h +++ b/compiler-rt/lib/scudo/standalone/wrappers_c_checks.h @@ -20,7 +20,7 @@ namespace scudo { // A common errno setting logic shared by almost all Scudo C wrappers. -INLINE void *setErrnoOnNull(void *Ptr) { +inline void *setErrnoOnNull(void *Ptr) { if (UNLIKELY(!Ptr)) errno = ENOMEM; return Ptr; @@ -30,14 +30,14 @@ INLINE void *setErrnoOnNull(void *Ptr) { // Checks aligned_alloc() parameters, verifies that the alignment is a power of // two and that the size is a multiple of alignment. -INLINE bool checkAlignedAllocAlignmentAndSize(uptr Alignment, uptr Size) { +inline bool checkAlignedAllocAlignmentAndSize(uptr Alignment, uptr Size) { return Alignment == 0 || !isPowerOfTwo(Alignment) || !isAligned(Size, Alignment); } // Checks posix_memalign() parameters, verifies that alignment is a power of two // and a multiple of sizeof(void *). -INLINE bool checkPosixMemalignAlignment(uptr Alignment) { +inline bool checkPosixMemalignAlignment(uptr Alignment) { return Alignment == 0 || !isPowerOfTwo(Alignment) || !isAligned(Alignment, sizeof(void *)); } @@ -45,7 +45,7 @@ INLINE bool checkPosixMemalignAlignment(uptr Alignment) { // Returns true if calloc(Size, N) overflows on Size*N calculation. Use a // builtin supported by recent clang & GCC if it exists, otherwise fallback to a // costly division. 
-INLINE bool checkForCallocOverflow(uptr Size, uptr N, uptr *Product) { +inline bool checkForCallocOverflow(uptr Size, uptr N, uptr *Product) { #if __has_builtin(__builtin_umull_overflow) return __builtin_umull_overflow(Size, N, Product); #else @@ -58,7 +58,7 @@ INLINE bool checkForCallocOverflow(uptr Size, uptr N, uptr *Product) { // Returns true if the size passed to pvalloc overflows when rounded to the next // multiple of PageSize. -INLINE bool checkForPvallocOverflow(uptr Size, uptr PageSize) { +inline bool checkForPvallocOverflow(uptr Size, uptr PageSize) { return roundUpTo(Size, PageSize) < Size; } diff --git a/compiler-rt/test/CMakeLists.txt b/compiler-rt/test/CMakeLists.txt index 02ce6aabd6995..f0330bcfe3041 100644 --- a/compiler-rt/test/CMakeLists.txt +++ b/compiler-rt/test/CMakeLists.txt @@ -1,6 +1,7 @@ # Needed for lit support in standalone builds. include(AddLLVM) +pythonize_bool(LLVM_ENABLE_EXPENSIVE_CHECKS) configure_compiler_rt_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.common.configured.in ${CMAKE_CURRENT_BINARY_DIR}/lit.common.configured) diff --git a/compiler-rt/test/builtins/Unit/addtf3_test.c b/compiler-rt/test/builtins/Unit/addtf3_test.c index 7ca0355e42adf..dcd4efe9c9015 100644 --- a/compiler-rt/test/builtins/Unit/addtf3_test.c +++ b/compiler-rt/test/builtins/Unit/addtf3_test.c @@ -76,7 +76,8 @@ int main() UINT64_C(0x61e58dd6c51eb77c))) return 1; -#if (defined(__arm__) || defined(__aarch64__)) && defined(__ARM_FP) +#if (defined(__arm__) || defined(__aarch64__)) && defined(__ARM_FP) || \ + defined(i386) || defined(__x86_64__) // Rounding mode tests on supported architectures const long double m = 1234.0L, n = 0.01L; diff --git a/compiler-rt/test/builtins/Unit/subtf3_test.c b/compiler-rt/test/builtins/Unit/subtf3_test.c index b95f2ef996d61..265ab642ecf0c 100644 --- a/compiler-rt/test/builtins/Unit/subtf3_test.c +++ b/compiler-rt/test/builtins/Unit/subtf3_test.c @@ -69,7 +69,8 @@ int main() UINT64_C(0xa44a7bca780a166c))) return 1; -#if (defined(__arm__) || defined(__aarch64__)) && defined(__ARM_FP) +#if (defined(__arm__) || defined(__aarch64__)) && defined(__ARM_FP) || \ + defined(i386) || defined(__x86_64__) // Rounding mode tests on supported architectures const long double m = 1234.02L, n = 0.01L; diff --git a/compiler-rt/test/fuzzer/large.test b/compiler-rt/test/fuzzer/large.test index b03b60fdb6503..9aa7c46dc42c8 100644 --- a/compiler-rt/test/fuzzer/large.test +++ b/compiler-rt/test/fuzzer/large.test @@ -1,3 +1,5 @@ +UNSUPPORTED: expensive_checks + RUN: %cpp_compiler %S/LargeTest.cpp -o %t-LargeTest RUN: %run %t-LargeTest -runs=10000 diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 2a8d93166f695..00f0a1e93abe1 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -477,6 +477,9 @@ def is_windows_lto_supported(): else: config.available_features.add("shadow-scale-3") +if config.expensive_checks: + config.available_features.add("expensive_checks") + # Propagate the LLD/LTO into the clang config option, so nothing else is needed. 
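Aside (editorial): checkForCallocOverflow above prefers a compiler builtin and only falls back to a division. The same pattern in standalone form, using the type-generic __builtin_mul_overflow rather than scudo's uptr-specific variant (an illustrative sketch, not scudo code; the lit configuration hunk resumes below):

```cpp
// Illustrative sketch of the calloc-style overflow check: builtin fast path,
// division fallback.
#include <cstddef>
#include <cstdio>

static inline bool mulOverflows(size_t Size, size_t N, size_t *Product) {
#if __has_builtin(__builtin_mul_overflow)
  // Fast path: the compiler computes the product and the overflow flag in
  // one step (typically a single multiply plus a flags check).
  return __builtin_mul_overflow(Size, N, Product);
#else
  *Product = Size * N; // unsigned multiplication wraps on overflow
  if (Size == 0)
    return false;
  // If the multiplication wrapped, dividing the product by one factor
  // cannot recover the other factor.
  return (*Product / Size) != N;
#endif
}

int main() {
  size_t Product;
  printf("%d\n", mulOverflows((size_t)-1, 2, &Product)); // 1: overflows
  printf("%d\n", mulOverflows(1000, 1000, &Product));    // 0: 1000000 fits
}
```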
run_wrapper = [] target_cflags = [getattr(config, 'target_cflags', None)] diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 5ca95efd530ce..b4862f74cdd02 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -42,6 +42,7 @@ set_default("android_serial", "@ANDROID_SERIAL_FOR_TESTING@") set_default("android_files_to_push", []) set_default("have_rpc_xdr_h", @HAVE_RPC_XDR_H@) set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@) +set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@) config.available_features.add('target-is-%s' % config.target_arch) if config.enable_per_target_runtime_dir: diff --git a/compiler-rt/test/profile/Windows/Inputs/instrprof-multiprocess.c b/compiler-rt/test/profile/Windows/Inputs/instrprof-multiprocess.c new file mode 100644 index 0000000000000..774712d39738c --- /dev/null +++ b/compiler-rt/test/profile/Windows/Inputs/instrprof-multiprocess.c @@ -0,0 +1,89 @@ +/* This is a test case where the parent process forks 10 children + * which contend to merge profile data to the same file. With + * file locking support, the data from each child should not + * be lost. + */ +#include <stdio.h> +#include <stdlib.h> +#include <windows.h> + +void spawn_child(PROCESS_INFORMATION *pi, int child_num) { + wchar_t child_str[10]; + _itow(child_num, child_str, 10); + if (!SetEnvironmentVariableW(L"CHILD_NUM", child_str)) { + printf("SetEnvironmentVariableW failed (0x%8lx).\n", GetLastError()); + fflush(stdout); + exit(1); + } + + STARTUPINFOW si; + memset(&si, 0, sizeof(si)); + si.cb = sizeof(si); + + memset(pi, 0, sizeof(PROCESS_INFORMATION)); + + if (!CreateProcessW(NULL, // No module name (use command line) + GetCommandLineW(), // Command line + NULL, // Process handle not inheritable + NULL, // Thread handle not inheritable + TRUE, // Set handle inheritance to TRUE + 0, // No flags + NULL, // Use parent's environment block + NULL, // Use parent's starting directory + &si, pi)) { + printf("CreateProcess failed (0x%08lx).\n", GetLastError()); + fflush(stdout); + exit(1); + } +} + +int wait_child(PROCESS_INFORMATION *pi) { + WaitForSingleObject(pi->hProcess, INFINITE); + + DWORD exit_code; + if (!GetExitCodeProcess(pi->hProcess, &exit_code)) { + printf("GetExitCodeProcess failed (0x%08lx).\n", GetLastError()); + fflush(stdout); + exit(1); + } + + CloseHandle(pi->hProcess); + CloseHandle(pi->hThread); + + return exit_code; +} + +#define NUM_CHILDREN 10 + +int foo(int num) { + if (num < (NUM_CHILDREN / 2)) { + return 1; + } else if (num < NUM_CHILDREN) { + return 2; + } + return 3; +} + +int main(int argc, char *argv[]) { + char *child_str = getenv("CHILD_NUM"); + if (!child_str) { + PROCESS_INFORMATION child[NUM_CHILDREN]; + // In parent + for (int i = 0; i < NUM_CHILDREN; i++) { + spawn_child(&child[i], i); + } + for (int i = 0; i < NUM_CHILDREN; i++) { + wait_child(&child[i]); + } + return 0; + } else { + // In child + int child_num = atoi(child_str); + int result = foo(child_num); + if (result == 3) { + fprintf(stderr, "Invalid child count!"); + return 1; + } + return 0; + } +} diff --git a/compiler-rt/test/profile/Windows/instrprof-multiprocess.test b/compiler-rt/test/profile/Windows/instrprof-multiprocess.test new file mode 100644 index 0000000000000..ae5ebd45bec95 --- /dev/null +++ b/compiler-rt/test/profile/Windows/instrprof-multiprocess.test @@ -0,0 +1,10 @@ +RUN: %clang_profgen %S/Inputs/instrprof-multiprocess.c -o %t +RUN: rm -f %t_*.profraw +RUN: env LLVM_PROFILE_FILE=%t_%m.profraw
%run %t +RUN: llvm-profdata show --counts -function=foo %t_*.profraw | FileCheck %s + +CHECK: Counters: +CHECK: foo: +CHECK: Function count: 10 +CHECK: Block counts: [5, 5] +CHECK: Functions shown: 1 diff --git a/compiler-rt/test/profile/Windows/lit.local.cfg.py b/compiler-rt/test/profile/Windows/lit.local.cfg.py new file mode 100644 index 0000000000000..e924d91c44934 --- /dev/null +++ b/compiler-rt/test/profile/Windows/lit.local.cfg.py @@ -0,0 +1,9 @@ +def getRoot(config): + if not config.parent: + return config + return getRoot(config.parent) + +root = getRoot(config) + +if root.host_os not in ['Windows']: + config.unsupported = True diff --git a/compiler-rt/test/tsan/race_range_pc.cc b/compiler-rt/test/tsan/race_range_pc.cpp similarity index 100% rename from compiler-rt/test/tsan/race_range_pc.cc rename to compiler-rt/test/tsan/race_range_pc.cpp diff --git a/compiler-rt/test/ubsan/TestCases/ImplicitConversion/integer-conversion-incdec.c b/compiler-rt/test/ubsan/TestCases/ImplicitConversion/integer-conversion-incdec.c new file mode 100644 index 0000000000000..0e62c02d3affb --- /dev/null +++ b/compiler-rt/test/ubsan/TestCases/ImplicitConversion/integer-conversion-incdec.c @@ -0,0 +1,122 @@ +// RUN: %clang -x c -fsanitize=implicit-conversion -O0 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-conversion -O1 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-conversion -O2 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-conversion -O3 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK + +// RUN: %clang -x c++ -fsanitize=implicit-conversion -O0 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-conversion -O1 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-conversion -O2 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-conversion -O3 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK + +void test_unsigned() { + unsigned char x; + + x = 0; + x++; + x = 0; + ++x; + + x = 0; + x--; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned) + x = 0; + --x; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned) + + x = 1; + x++; + x = 1; + ++x; + + x = 1; + x--; + x = 1; + --x; + + x = 254; + x++; + x = 254; + ++x; + + x = 254; + x--; + x = 254; + --x; + + x = 255; + x++; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned) + x = 255; + ++x; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value 
256 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned) + + x = 255; + x--; + x = 255; + --x; +} + +void test_signed() { + signed char x; + + x = -128; + x++; + x = -128; + ++x; + + x = -128; + x--; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed) + x = -128; + --x; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed) + + x = -1; + x++; + x = -1; + ++x; + + x = -1; + x--; + x = -1; + --x; + + x = 0; + x++; + x = 0; + ++x; + + x = 0; + x--; + x = 0; + --x; + + x = 1; + x++; + x = 1; + ++x; + + x = 1; + x--; + x = 1; + --x; + + x = 127; + x++; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed) + x = 127; + ++x; + // CHECK: {{.*}}integer-conversion-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed) + + x = 127; + x--; + x = 127; + --x; +} + +int main() { + test_unsigned(); + test_signed(); + + return 0; +} diff --git a/compiler-rt/test/ubsan/TestCases/ImplicitConversion/integer-sign-change-incdec.c b/compiler-rt/test/ubsan/TestCases/ImplicitConversion/integer-sign-change-incdec.c new file mode 100644 index 0000000000000..4b56a105aa289 --- /dev/null +++ b/compiler-rt/test/ubsan/TestCases/ImplicitConversion/integer-sign-change-incdec.c @@ -0,0 +1,120 @@ +// RUN: %clang -x c -fsanitize=implicit-integer-sign-change -O0 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-integer-sign-change -O1 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-integer-sign-change -O2 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-integer-sign-change -O3 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK + +// RUN: %clang -x c++ -fsanitize=implicit-integer-sign-change -O0 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-integer-sign-change -O1 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-integer-sign-change -O2 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-integer-sign-change -O3 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK + +void test_unsigned() { + unsigned char x; + + x = 0; + x++; + x = 0; + ++x; + + x = 0; + x--; + // CHECK: {{.*}}integer-sign-change-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned) + x = 0; + --x; + // CHECK: {{.*}}integer-sign-change-incdec.c:[[@LINE-1]]:3: runtime 
error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned) + + x = 1; + x++; + x = 1; + ++x; + + x = 1; + x--; + x = 1; + --x; + + x = 254; + x++; + x = 254; + ++x; + + x = 254; + x--; + x = 254; + --x; + + x = 255; + x++; + x = 255; + ++x; + + x = 255; + x--; + x = 255; + --x; +} + +void test_signed() { + signed char x; + + x = -128; + x++; + x = -128; + ++x; + + x = -128; + x--; + // CHECK: {{.*}}integer-sign-change-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed) + x = -128; + --x; + // CHECK: {{.*}}integer-sign-change-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed) + + x = -1; + x++; + x = -1; + ++x; + + x = -1; + x--; + x = -1; + --x; + + x = 0; + x++; + x = 0; + ++x; + + x = 0; + x--; + x = 0; + --x; + + x = 1; + x++; + x = 1; + ++x; + + x = 1; + x--; + x = 1; + --x; + + x = 127; + x++; + // CHECK: {{.*}}integer-sign-change-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed) + x = 127; + ++x; + // CHECK: {{.*}}integer-sign-change-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed) + + x = 127; + x--; + x = 127; + --x; +} + +int main() { + test_unsigned(); + test_signed(); + + return 0; +} diff --git a/compiler-rt/test/ubsan/TestCases/ImplicitConversion/signed-integer-truncation-incdec.c b/compiler-rt/test/ubsan/TestCases/ImplicitConversion/signed-integer-truncation-incdec.c new file mode 100644 index 0000000000000..4806efb24eb13 --- /dev/null +++ b/compiler-rt/test/ubsan/TestCases/ImplicitConversion/signed-integer-truncation-incdec.c @@ -0,0 +1,122 @@ +// RUN: %clang -x c -fsanitize=implicit-signed-integer-truncation -O0 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-signed-integer-truncation -O1 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-signed-integer-truncation -O2 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c -fsanitize=implicit-signed-integer-truncation -O3 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK + +// RUN: %clang -x c++ -fsanitize=implicit-signed-integer-truncation -O0 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-signed-integer-truncation -O1 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-signed-integer-truncation -O2 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK +// RUN: %clang -x c++ -fsanitize=implicit-signed-integer-truncation -O3 %s -o %t && %run %t 2>&1 | FileCheck %s --implicit-check-not="implicit conversion" --check-prefixes=CHECK + +void test_unsigned() { + unsigned char x; + + x = 0; + x++; + x = 0; + ++x; + + x = 0; + 
x--; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned) + x = 0; + --x; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value -1 (32-bit, signed) to type 'unsigned char' changed the value to 255 (8-bit, unsigned) + + x = 1; + x++; + x = 1; + ++x; + + x = 1; + x--; + x = 1; + --x; + + x = 254; + x++; + x = 254; + ++x; + + x = 254; + x--; + x = 254; + --x; + + x = 255; + x++; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned) + x = 255; + ++x; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value 256 (32-bit, signed) to type 'unsigned char' changed the value to 0 (8-bit, unsigned) + + x = 255; + x--; + x = 255; + --x; +} + +void test_signed() { + signed char x; + + x = -128; + x++; + x = -128; + ++x; + + x = -128; + x--; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed) + x = -128; + --x; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value -129 (32-bit, signed) to type 'signed char' changed the value to 127 (8-bit, signed) + + x = -1; + x++; + x = -1; + ++x; + + x = -1; + x--; + x = -1; + --x; + + x = 0; + x++; + x = 0; + ++x; + + x = 0; + x--; + x = 0; + --x; + + x = 1; + x++; + x = 1; + ++x; + + x = 1; + x--; + x = 1; + --x; + + x = 127; + x++; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:4: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed) + x = 127; + ++x; + // CHECK: {{.*}}signed-integer-truncation-incdec.c:[[@LINE-1]]:3: runtime error: implicit conversion from type 'int' of value 128 (32-bit, signed) to type 'signed char' changed the value to -128 (8-bit, signed) + + x = 127; + x--; + x = 127; + --x; +} + +int main() { + test_unsigned(); + test_signed(); + + return 0; +} diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index bfbba319d7208..fa488da0885fb 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -565,7 +565,12 @@ function(cxx_add_basic_build_flags target) endif() if (LIBCXX_HAS_COMMENT_LIB_PRAGMA) - target_compile_definitions(${target} PRIVATE -D_LIBCPP_HAS_COMMENT_LIB_PRAGMA) + if (LIBCXX_HAS_PTHREAD_LIB) + target_compile_definitions(${target} PRIVATE -D_LIBCPP_LINK_PTHREAD_LIB) + endif() + if (LIBCXX_HAS_RT_LIB) + target_compile_definitions(${target} PRIVATE -D_LIBCPP_LINK_RT_LIB) + endif() endif() endfunction() diff --git a/libcxx/include/string b/libcxx/include/string index c16dbedc51c0f..4e0b21135a7e6 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -2289,10 +2289,20 @@ basic_string<_CharT, _Traits, _Allocator>::__move_assign(basic_string& __str, tr _NOEXCEPT_(is_nothrow_move_assignable::value) #endif { - __clear_and_shrink(); - __r_.first() = __str.__r_.first(); - __move_assign_alloc(__str); - __str.__zero(); + if (__is_long()) { + __alloc_traits::deallocate(__alloc(), __get_long_pointer(), + __get_long_cap()); +#if 
_LIBCPP_STD_VER <= 14 + if (!is_nothrow_move_assignable<allocator_type>::value) { + __set_short_size(0); + traits_type::assign(__get_short_pointer()[0], value_type()); + } +#endif + } + __move_assign_alloc(__str); + __r_.first() = __str.__r_.first(); + __str.__set_short_size(0); + traits_type::assign(__str.__get_short_pointer()[0], value_type()); } template <class _CharT, class _Traits, class _Allocator> diff --git a/libcxx/src/algorithm.cpp b/libcxx/src/algorithm.cpp index ffdcb5fccde7a..40669fb9e7561 100644 --- a/libcxx/src/algorithm.cpp +++ b/libcxx/src/algorithm.cpp @@ -10,7 +10,7 @@ #include "random" #ifndef _LIBCPP_HAS_NO_THREADS #include "mutex" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libcxx/src/chrono.cpp b/libcxx/src/chrono.cpp index 2d78caea61c35..9d448b6a985bc 100644 --- a/libcxx/src/chrono.cpp +++ b/libcxx/src/chrono.cpp @@ -37,7 +37,7 @@ #endif #endif -#if defined(__unix__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_RT_LIB) #pragma comment(lib, "rt") #endif diff --git a/libcxx/src/condition_variable.cpp b/libcxx/src/condition_variable.cpp index bf89d255dd823..d133b010d71f4 100644 --- a/libcxx/src/condition_variable.cpp +++ b/libcxx/src/condition_variable.cpp @@ -15,7 +15,7 @@ #include "system_error" #include "__undef_macros" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif diff --git a/libcxx/src/debug.cpp b/libcxx/src/debug.cpp index 1f5ce1052f87d..20055fcf7590e 100644 --- a/libcxx/src/debug.cpp +++ b/libcxx/src/debug.cpp @@ -15,7 +15,7 @@ #include "__hash_table" #ifndef _LIBCPP_HAS_NO_THREADS #include "mutex" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libcxx/src/experimental/memory_resource.cpp b/libcxx/src/experimental/memory_resource.cpp index e987262831736..68c5bc99cc72a 100644 --- a/libcxx/src/experimental/memory_resource.cpp +++ b/libcxx/src/experimental/memory_resource.cpp @@ -12,7 +12,7 @@ #include "atomic" #elif !defined(_LIBCPP_HAS_NO_THREADS) #include "mutex" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index 08a6b2b86e26a..876399fb4d4e5 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -44,7 +44,7 @@ #include <sys/time.h> // for gettimeofday and timeval #endif // !defined(CLOCK_REALTIME) -#if defined(__unix__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_RT_LIB) #pragma comment(lib, "rt") #endif diff --git a/libcxx/src/memory.cpp b/libcxx/src/memory.cpp index e89d94f27e4cd..633c9a6f56580 100644 --- a/libcxx/src/memory.cpp +++ b/libcxx/src/memory.cpp @@ -10,7 +10,7 @@ #ifndef _LIBCPP_HAS_NO_THREADS #include "mutex" #include "thread" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && 
defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libcxx/src/mutex.cpp b/libcxx/src/mutex.cpp index 7e979cd890413..27a4fd8927785 100644 --- a/libcxx/src/mutex.cpp +++ b/libcxx/src/mutex.cpp @@ -13,7 +13,7 @@ #include "__undef_macros" #ifndef _LIBCPP_HAS_NO_THREADS -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libcxx/src/shared_mutex.cpp b/libcxx/src/shared_mutex.cpp index 9e6d5202aafaf..5feef9f4889f4 100644 --- a/libcxx/src/shared_mutex.cpp +++ b/libcxx/src/shared_mutex.cpp @@ -10,7 +10,7 @@ #ifndef _LIBCPP_HAS_NO_THREADS #include "shared_mutex" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif diff --git a/libcxx/src/thread.cpp b/libcxx/src/thread.cpp index 967a53735accb..c0bc1cbbbbc32 100644 --- a/libcxx/src/thread.cpp +++ b/libcxx/src/thread.cpp @@ -35,7 +35,7 @@ #include #endif -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCPP_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCPP_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py index 07657ea6e4f8f..befe75c20e76b 100644 --- a/libcxx/utils/libcxx/test/config.py +++ b/libcxx/utils/libcxx/test/config.py @@ -915,6 +915,7 @@ def configure_warnings(self): self.cxx.addWarningFlagIfSupported('-Wshadow') self.cxx.addWarningFlagIfSupported('-Wno-unused-command-line-argument') self.cxx.addWarningFlagIfSupported('-Wno-attributes') + self.cxx.addWarningFlagIfSupported('-Wno-deprecated-copy') self.cxx.addWarningFlagIfSupported('-Wno-constant-evaluated') self.cxx.addWarningFlagIfSupported('-Wno-pessimizing-move') self.cxx.addWarningFlagIfSupported('-Wno-c++11-extensions') diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index d914b6e02905e..0ddcd5f971f7f 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -456,7 +456,9 @@ if (LIBCXXABI_BAREMETAL) endif() if (LIBCXXABI_HAS_COMMENT_LIB_PRAGMA) - add_definitions(-D_LIBCXXABI_HAS_COMMENT_LIB_PRAGMA) + if (LIBCXXABI_HAS_PTHREAD_LIB) + add_definitions(-D_LIBCXXABI_LINK_PTHREAD_LIB) + endif() endif() string(REPLACE ";" " " LIBCXXABI_CXX_FLAGS "${LIBCXXABI_CXX_FLAGS}") diff --git a/libcxxabi/src/cxa_exception_storage.cpp b/libcxxabi/src/cxa_exception_storage.cpp index 28c0122ff0746..24ff55e39d291 100644 --- a/libcxxabi/src/cxa_exception_storage.cpp +++ b/libcxxabi/src/cxa_exception_storage.cpp @@ -46,7 +46,7 @@ extern "C" { #include "abort_message.h" #include "fallback_malloc.h" -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCXXABI_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCXXABI_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif diff --git a/libcxxabi/src/cxa_guard_impl.h b/libcxxabi/src/cxa_guard_impl.h index 98e42ba2fb0b0..a8ec0b72feea2 100644 --- a/libcxxabi/src/cxa_guard_impl.h +++ b/libcxxabi/src/cxa_guard_impl.h @@ -50,7 +50,7 @@ #include #include <__threading_support> #ifndef _LIBCXXABI_HAS_NO_THREADS -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCXXABI_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCXXABI_LINK_PTHREAD_LIB) #pragma 
comment(lib, "pthread") #endif #endif diff --git a/libcxxabi/src/cxa_thread_atexit.cpp b/libcxxabi/src/cxa_thread_atexit.cpp index 923b265b27c2c..a940eaf2f9cc3 100644 --- a/libcxxabi/src/cxa_thread_atexit.cpp +++ b/libcxxabi/src/cxa_thread_atexit.cpp @@ -10,7 +10,7 @@ #include "cxxabi.h" #include <__threading_support> #ifndef _LIBCXXABI_HAS_NO_THREADS -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCXXABI_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCXXABI_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libcxxabi/src/fallback_malloc.cpp b/libcxxabi/src/fallback_malloc.cpp index 8f301bcacd14c..fdae40764abef 100644 --- a/libcxxabi/src/fallback_malloc.cpp +++ b/libcxxabi/src/fallback_malloc.cpp @@ -13,7 +13,7 @@ #include <__threading_support> #ifndef _LIBCXXABI_HAS_NO_THREADS -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBCXXABI_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBCXXABI_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt index 25dc95cf6ba76..08095d1333a56 100644 --- a/libunwind/CMakeLists.txt +++ b/libunwind/CMakeLists.txt @@ -352,7 +352,12 @@ if (WIN32 AND LIBUNWIND_ENABLE_STATIC AND NOT LIBUNWIND_ENABLE_SHARED) endif() if (LIBUNWIND_HAS_COMMENT_LIB_PRAGMA) - add_definitions(-D_LIBUNWIND_HAS_COMMENT_LIB_PRAGMA) + if (LIBUNWIND_HAS_DL_LIB) + add_definitions(-D_LIBUNWIND_LINK_DL_LIB) + endif() + if (LIBUNWIND_HAS_PTHREAD_LIB) + add_definitions(-D_LIBUNWIND_LINK_PTHREAD_LIB) + endif() endif() #=============================================================================== diff --git a/libunwind/src/AddressSpace.hpp b/libunwind/src/AddressSpace.hpp index db67df4dc80ac..7433476f91172 100644 --- a/libunwind/src/AddressSpace.hpp +++ b/libunwind/src/AddressSpace.hpp @@ -27,7 +27,7 @@ #if _LIBUNWIND_USE_DLADDR #include -#if defined(__unix__) && defined(__ELF__) && defined(_LIBUNWIND_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBUNWIND_LINK_DL_LIB) #pragma comment(lib, "dl") #endif #endif diff --git a/libunwind/src/RWMutex.hpp b/libunwind/src/RWMutex.hpp index 954e94c322d45..fcd3f4967d17f 100644 --- a/libunwind/src/RWMutex.hpp +++ b/libunwind/src/RWMutex.hpp @@ -17,7 +17,7 @@ #include #elif !defined(_LIBUNWIND_HAS_NO_THREADS) #include -#if defined(__unix__) && !defined(__ANDROID__) && defined(__ELF__) && defined(_LIBUNWIND_HAS_COMMENT_LIB_PRAGMA) +#if defined(__ELF__) && defined(_LIBUNWIND_LINK_PTHREAD_LIB) #pragma comment(lib, "pthread") #endif #endif diff --git a/lld/ELF/Arch/AArch64.cpp b/lld/ELF/Arch/AArch64.cpp index 5cf07029fa1d5..4e80e3d78f167 100644 --- a/lld/ELF/Arch/AArch64.cpp +++ b/lld/ELF/Arch/AArch64.cpp @@ -40,7 +40,8 @@ class AArch64 : public TargetInfo { void writePlt(uint8_t *buf, uint64_t gotPltEntryAddr, uint64_t pltEntryAddr, int32_t index, unsigned relOff) const override; bool needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const override; + uint64_t branchAddr, const Symbol &s, + int64_t a) const override; uint32_t getThunkSectionSpacing() const override; bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override; bool usesOnlyLowPageBits(RelType type) const override; @@ -230,13 +231,14 @@ void AArch64::writePlt(uint8_t *buf, uint64_t gotPltEntryAddr, } bool AArch64::needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const { + 
uint64_t branchAddr, const Symbol &s, + int64_t a) const { // ELF for the ARM 64-bit architecture, section Call and Jump relocations // only permits range extension thunks for R_AARCH64_CALL26 and // R_AARCH64_JUMP26 relocation types. if (type != R_AARCH64_CALL26 && type != R_AARCH64_JUMP26) return false; - uint64_t dst = (expr == R_PLT_PC) ? s.getPltVA() : s.getVA(); + uint64_t dst = expr == R_PLT_PC ? s.getPltVA() : s.getVA(a); return !inBranchRange(type, branchAddr, dst); } diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index 41baea496d369..0f522d324ff74 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -39,7 +39,8 @@ class ARM final : public TargetInfo { void addPltSymbols(InputSection &isec, uint64_t off) const override; void addPltHeaderSymbols(InputSection &isd) const override; bool needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const override; + uint64_t branchAddr, const Symbol &s, + int64_t a) const override; uint32_t getThunkSectionSpacing() const override; bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override; void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override; @@ -262,7 +263,7 @@ void ARM::addPltSymbols(InputSection &isec, uint64_t off) const { } bool ARM::needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const { + uint64_t branchAddr, const Symbol &s, int64_t /*a*/) const { // If S is an undefined weak symbol and does not have a PLT entry then it // will be resolved as a branch to the next instruction. if (s.isUndefWeak() && !s.isInPlt()) diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index 74c0b59ecd5b9..317b22ec264c8 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -35,7 +35,8 @@ template class MIPS final : public TargetInfo { void writePlt(uint8_t *buf, uint64_t gotPltEntryAddr, uint64_t pltEntryAddr, int32_t index, unsigned relOff) const override; bool needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const override; + uint64_t branchAddr, const Symbol &s, + int64_t a) const override; void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override; bool usesOnlyLowPageBits(RelType type) const override; }; @@ -356,7 +357,8 @@ void MIPS::writePlt(uint8_t *buf, uint64_t gotPltEntryAddr, template bool MIPS::needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const { + uint64_t branchAddr, const Symbol &s, + int64_t /*a*/) const { // Any MIPS PIC code function is invoked with its address in register $t9. 
// So if we have a branch instruction from non-PIC code to the PIC one // we cannot make the jump directly and need to create a small stubs diff --git a/lld/ELF/Arch/PPC.cpp b/lld/ELF/Arch/PPC.cpp index c4eecb9a29c22..b0d93c6ce9b5c 100644 --- a/lld/ELF/Arch/PPC.cpp +++ b/lld/ELF/Arch/PPC.cpp @@ -37,7 +37,8 @@ class PPC final : public TargetInfo { } void writeGotPlt(uint8_t *buf, const Symbol &s) const override; bool needsThunk(RelExpr expr, RelType relocType, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const override; + uint64_t branchAddr, const Symbol &s, + int64_t a) const override; uint32_t getThunkSectionSpacing() const override; bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override; void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override; @@ -169,7 +170,7 @@ void PPC::writeGotPlt(uint8_t *buf, const Symbol &s) const { } bool PPC::needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const { + uint64_t branchAddr, const Symbol &s, int64_t /*a*/) const { if (type != R_PPC_REL24 && type != R_PPC_PLTREL24) return false; if (s.isInPlt()) diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 6299fd8a52436..ed16974af8679 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -205,7 +205,8 @@ class PPC64 final : public TargetInfo { void relocateOne(uint8_t *loc, RelType type, uint64_t val) const override; void writeGotHeader(uint8_t *buf) const override; bool needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const override; + uint64_t branchAddr, const Symbol &s, + int64_t a) const override; uint32_t getThunkSectionSpacing() const override; bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override; RelExpr adjustRelaxExpr(RelType type, const uint8_t *data, @@ -898,7 +899,7 @@ void PPC64::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { } bool PPC64::needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const { + uint64_t branchAddr, const Symbol &s, int64_t /*a*/) const { if (type != R_PPC64_REL14 && type != R_PPC64_REL24) return false; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index b13bb5e00def3..a0987259d24ba 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1408,13 +1408,13 @@ static void handleUndefinedGlob(StringRef arg) { } std::vector syms; - symtab->forEachSymbol([&](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) { // Calling Sym->fetch() from here is not safe because it may // add new symbols to the symbol table, invalidating the // current iterator. So we just keep a note. if (pat->match(sym->getName())) syms.push_back(sym); - }); + } for (Symbol *sym : syms) handleUndefined(sym); @@ -1440,10 +1440,10 @@ static void handleLibcall(StringRef name) { // result, the passes after the symbol resolution won't see any // symbols of type CommonSymbol. 
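Aside (editorial): the Driver.cpp hunk above is careful to "just keep a note" because Symbol::fetch() can add new symbols and invalidate the iterator of the container being walked. A dependency-free sketch of that snapshot-then-mutate pattern, all names illustrative (the diff resumes below):

```cpp
// Illustrative model of handleUndefinedGlob: fetch() may append to the
// table, so matches are snapshotted first and only then processed.
#include <cassert>
#include <string>
#include <vector>

static std::vector<std::string> table = {"foo", "foobar", "baz"};

// Stand-in for Symbol::fetch(): grows `table`, so it must never be called
// while a range-for over `table` is live.
static void fetch(const std::string &name) { table.push_back(name + "$lazy"); }

static void handleGlob(const std::string &pat) {
  std::vector<std::string> matches; // 1) keep a note of the matches
  for (const std::string &name : table)
    if (name.find(pat) != std::string::npos)
      matches.push_back(name);
  for (const std::string &name : matches) // 2) mutate the table safely
    fetch(name);
}

int main() {
  handleGlob("foo"); // matches "foo" and "foobar"
  assert(table.size() == 5);
  return 0;
}
```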
static void replaceCommonSymbols() { - symtab->forEachSymbol([](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) { auto *s = dyn_cast<CommonSymbol>(sym); if (!s) - return; + continue; auto *bss = make<BssSection>("COMMON", s->size, s->alignment); bss->file = s->file; @@ -1451,7 +1451,7 @@ static void replaceCommonSymbols() { inputSections.push_back(bss); s->replace(Defined{s->file, s->getName(), s->binding, s->stOther, s->type, /*value=*/0, s->size, bss}); - }); + } } // If all references to a DSO happen to be weak, the DSO is not added @@ -1459,15 +1459,15 @@ static void replaceCommonSymbols() { // created from the DSO. Otherwise, they become dangling references // that point to a non-existent DSO. static void demoteSharedSymbols() { - symtab->forEachSymbol([](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) { auto *s = dyn_cast<SharedSymbol>(sym); if (!s || s->getFile().isNeeded) - return; + continue; bool used = s->used; s->replace(Undefined{nullptr, s->getName(), STB_WEAK, s->stOther, s->type}); s->used = used; - }); + } } // The section referred to by `s` is considered address-significant. Set the @@ -1503,10 +1503,9 @@ static void findKeepUniqueSections(opt::InputArgList &args) { // Symbols in the dynsym could be address-significant in other executables // or DSOs, so we conservatively mark them as address-significant. - symtab->forEachSymbol([&](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) if (sym->includeInDynsym()) markAddrsig(sym); - }); // Visit the address-significance table in each object file and mark each // referenced symbol as address-significant. diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index 6da409568c8b1..524d552b0b84d 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -145,12 +145,12 @@ BitcodeCompiler::BitcodeCompiler() { config->ltoPartitions); // Initialize usedStartStop. - symtab->forEachSymbol([&](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) { StringRef s = sym->getName(); for (StringRef prefix : {"__start_", "__stop_"}) if (s.startswith(prefix)) usedStartStop.insert(s.substr(prefix.size())); - }); + } } BitcodeCompiler::~BitcodeCompiler() = default; diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index cebbd89168be5..a1561d2d41591 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -442,7 +442,7 @@ LinkerScript::computeInputSections(const InputSectionDescription *cmd) { } void LinkerScript::discard(InputSectionBase *s) { - if (s == in.shStrTab || s == mainPart->relaDyn || s == mainPart->relrDyn) + if (s == in.shStrTab || s == mainPart->relrDyn) error("discarding " + s->name + " section is not allowed"); // You can discard .hash and .gnu.hash sections by linker scripts. Since diff --git a/lld/ELF/MarkLive.cpp b/lld/ELF/MarkLive.cpp index 62fb8fe83a2ef..bb0105c289282 100644 --- a/lld/ELF/MarkLive.cpp +++ b/lld/ELF/MarkLive.cpp @@ -219,10 +219,9 @@ template <class ELFT> void MarkLive<ELFT>::run() { // Preserve externally-visible symbols if the symbols defined by this // file can interrupt other ELF file's symbols at runtime. - symtab->forEachSymbol([&](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) if (sym->includeInDynsym() && sym->partition == partition) markSymbol(sym); - }); // If this isn't the main partition, that's all that we need to preserve. if (partition != 1) { @@ -330,11 +329,10 @@ template <class ELFT> void markLive() { sec->markLive(); // If a DSO defines a symbol referenced in a regular object, it is needed.
- symtab->forEachSymbol([](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) if (auto *s = dyn_cast<SharedSymbol>(sym)) if (s->isUsedInRegularObj && !s->isWeak()) s->getFile().isNeeded = true; - }); return; } diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp index cc051dba0e0aa..8d328626b85f9 100644 --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -272,7 +272,12 @@ template <class ELFT> void OutputSection::maybeCompress() { // Write section contents to a temporary buffer and compress it. std::vector<uint8_t> buf(size); writeTo(buf.data()); - if (Error e = zlib::compress(toStringRef(buf), compressedData)) + // We chose 1 as the default compression level because it is the fastest. If + // -O2 is given, we use level 6 to compress debug info more by ~15%. We found + // that level 7 to 9 doesn't make much difference (~1% more compression) while + // they take significant amount of time (~2x), so level 6 seems enough. + if (Error e = zlib::compress(toStringRef(buf), compressedData, + config->optimize >= 2 ? 6 : 1)) fatal("compress failed: " + llvm::toString(std::move(e))); // Update section headers. diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index a4fc1ffbd1e72..ea30662d38249 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -777,6 +777,14 @@ static const Symbol *getAlternativeSpelling(const Undefined &sym, return s; } + // Case mismatch, e.g. Foo vs FOO. + for (auto &it : map) + if (name.equals_lower(it.first)) + return it.second; + for (Symbol *sym : symtab->symbols()) + if (!sym->isUndefined() && name.equals_lower(sym->getName())) + return sym; + // The reference may be a mangled name while the definition is not. Suggest a // missing extern "C". if (name.startswith("_Z")) { @@ -799,10 +807,11 @@ static const Symbol *getAlternativeSpelling(const Undefined &sym, break; } if (!s) - symtab->forEachSymbol([&](Symbol *sym) { - if (!s && canSuggestExternCForCXX(name, sym->getName())) + for (Symbol *sym : symtab->symbols()) + if (canSuggestExternCForCXX(name, sym->getName())) { s = sym; - }); + break; + } if (s) { pre_hint = " to declare "; post_hint = " as extern \"C\"?"; @@ -1754,23 +1763,43 @@ static bool isThunkSectionCompatible(InputSection *source, return true; } +static int64_t getPCBias(RelType type) { + if (config->emachine != EM_ARM) + return 0; + switch (type) { + case R_ARM_THM_JUMP19: + case R_ARM_THM_JUMP24: + case R_ARM_THM_CALL: + return 4; + default: + return 8; + } +} + std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec, Relocation &rel, uint64_t src) { std::vector<Thunk *> *thunkVec = nullptr; + int64_t addend = rel.addend + getPCBias(rel.type); - // We use (section, offset) pair to find the thunk position if possible so - // that we create only one thunk for aliased symbols or ICFed sections. + // We use a ((section, offset), addend) pair to find the thunk position if + // possible so that we create only one thunk for aliased symbols or ICFed + // sections. There may be multiple relocations sharing the same (section, + // offset + addend) pair. We may revert the relocation back to its original + // non-Thunk target, so we cannot fold offset + addend.
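Aside (editorial): getPCBias above encodes the ARM pipeline bias -- branch displacements are measured from PC+8 in ARM state and PC+4 in Thumb -- which is why the range checks in this patch compare src against getVA(addend) plus the bias, and why thunks are now keyed on the addend as well. A toy model of such a range check, with illustrative constants (the getThunk hunk resumes below):

```cpp
// Toy model (constants illustrative): an ARM-state B/BL encodes a +/-32 MiB
// displacement measured from PC+8, so the range check applies the bias
// before comparing. Real checks also require 4-byte-aligned displacements.
#include <cstdint>
#include <cstdio>

static bool inBranchRange(uint64_t src, uint64_t dst) {
  const int64_t bias = 8;                 // ARM state; Thumb would use 4
  const int64_t range = 32 * 1024 * 1024; // +/-32 MiB span for B/BL
  int64_t displacement = (int64_t)(dst - src) - bias;
  return displacement >= -range && displacement < range;
}

int main() {
  uint64_t src = 0x10000;
  // Reachable: the +8 bias buys the last few bytes of the forward window.
  printf("%d\n", inBranchRange(src, src + 32 * 1024 * 1024));     // 1
  // One encoding step beyond PC+8+32MiB is out of range.
  printf("%d\n", inBranchRange(src, src + 32 * 1024 * 1024 + 8)); // 0
}
```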
if (auto *d = dyn_cast<Defined>(rel.sym)) if (!d->isInPlt() && d->section) - thunkVec = &thunkedSymbolsBySection[{d->section->repl, d->value}]; + thunkVec = &thunkedSymbolsBySectionAndAddend[{ + {d->section->repl, d->value}, addend}]; if (!thunkVec) - thunkVec = &thunkedSymbols[rel.sym]; + thunkVec = &thunkedSymbols[{rel.sym, addend}]; // Check existing Thunks for Sym to see if they can be reused for (Thunk *t : *thunkVec) if (isThunkSectionCompatible(isec, t->getThunkTargetSym()->section) && t->isCompatibleWith(*isec, rel) && - target->inBranchRange(rel.type, src, t->getThunkTargetSym()->getVA())) + target->inBranchRange(rel.type, src, + t->getThunkTargetSym()->getVA(rel.addend) + + getPCBias(rel.type))) return std::make_pair(t, false); // No existing compatible Thunk in range, create a new one @@ -1785,9 +1814,13 @@ std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec, // relocation back to its original non-Thunk target. bool ThunkCreator::normalizeExistingThunk(Relocation &rel, uint64_t src) { if (Thunk *t = thunks.lookup(rel.sym)) { - if (target->inBranchRange(rel.type, src, rel.sym->getVA())) + if (target->inBranchRange(rel.type, src, + rel.sym->getVA(rel.addend) + getPCBias(rel.type))) return true; rel.sym = &t->destination; + // TODO Restore addend on all targets. + if (config->emachine == EM_AARCH64) + rel.addend = t->addend; if (rel.sym->isInPlt()) rel.expr = toPlt(rel.expr); } @@ -1843,7 +1876,7 @@ bool ThunkCreator::createThunks(ArrayRef<OutputSection *> outputSections) { continue; if (!target->needsThunk(rel.expr, rel.type, isec->file, src, - *rel.sym)) + *rel.sym, rel.addend)) continue; Thunk *t; @@ -1865,9 +1898,13 @@ bool ThunkCreator::createThunks(ArrayRef<OutputSection *> outputSections) { rel.sym = t->getThunkTargetSym(); rel.expr = fromPlt(rel.expr); + // On AArch64, a jump/call relocation may be encoded as STT_SECTION + // + non-zero addend, clear the addend after redirection. + // // The addend of R_PPC_PLTREL24 should be ignored after changing to // R_PC. - if (config->emachine == EM_PPC && rel.type == R_PPC_PLTREL24) + if (config->emachine == EM_AARCH64 || + (config->emachine == EM_PPC && rel.type == R_PPC_PLTREL24)) rel.addend = 0; } diff --git a/lld/ELF/Relocations.h b/lld/ELF/Relocations.h index befe15b8f3b9b..060c55e3086d8 100644 --- a/lld/ELF/Relocations.h +++ b/lld/ELF/Relocations.h @@ -150,10 +150,17 @@ class ThunkCreator { bool normalizeExistingThunk(Relocation &rel, uint64_t src); - // Record all the available Thunks for a Symbol - llvm::DenseMap<std::pair<SectionBase *, uint64_t>, std::vector<Thunk *>> - thunkedSymbolsBySection; - llvm::DenseMap<Symbol *, std::vector<Thunk *>> thunkedSymbols; + // Record all the available Thunks for a (Symbol, addend) pair, where Symbol + // is represented as a (section, offset) pair. There may be multiple + // relocations sharing the same (section, offset + addend) pair. We may revert + // a relocation back to its original non-Thunk target, and restore the + // original addend, so we cannot fold offset + addend. A nested pair is used + // because DenseMapInfo is not specialized for std::tuple. + llvm::DenseMap<std::pair<std::pair<SectionBase *, uint64_t>, int64_t>, + std::vector<Thunk *>> + thunkedSymbolsBySectionAndAddend; + llvm::DenseMap<std::pair<Symbol *, int64_t>, std::vector<Thunk *>> + thunkedSymbols; // Find a Thunk from the Thunks symbol definition, we can use this to find // the Thunk from a relocation to the Thunks symbol definition. diff --git a/lld/ELF/SymbolTable.h b/lld/ELF/SymbolTable.h index d3be0cb6450f9..507af8d2be75d 100644 --- a/lld/ELF/SymbolTable.h +++ b/lld/ELF/SymbolTable.h @@ -32,15 +32,19 @@ namespace elf { // add*() functions, which are called by input files as they are parsed.
There // is one add* function per symbol type. class SymbolTable { -public: - void wrap(Symbol *sym, Symbol *real, Symbol *wrap); + struct FilterOutPlaceholder { + bool operator()(Symbol *S) const { return !S->isPlaceholder(); } + }; + using iterator = llvm::filter_iterator<std::vector<Symbol *>::const_iterator, + FilterOutPlaceholder>; - void forEachSymbol(llvm::function_ref<void(Symbol *)> fn) { - for (Symbol *sym : symVector) - if (!sym->isPlaceholder()) - fn(sym); +public: + llvm::iterator_range<iterator> symbols() const { + return llvm::make_filter_range(symVector, FilterOutPlaceholder()); } + void wrap(Symbol *sym, Symbol *real, Symbol *wrap); + Symbol *insert(StringRef name); Symbol *addSymbol(const Symbol &newSym); diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 40cc92faf7bb9..5bf7949dab53f 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -298,8 +298,8 @@ static size_t getHashSize() { // sets is empty, or some input files didn't have .note.gnu.property sections), // we don't create this section. GnuPropertySection::GnuPropertySection() - : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, 4, - ".note.gnu.property") {} + : SyntheticSection(llvm::ELF::SHF_ALLOC, llvm::ELF::SHT_NOTE, + config->wordsize, ".note.gnu.property") {} void GnuPropertySection::writeTo(uint8_t *buf) { uint32_t featureAndType = config->emachine == EM_AARCH64 diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index 024e0cfec27b5..e1e99556ec7b0 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -130,7 +130,8 @@ int64_t TargetInfo::getImplicitAddend(const uint8_t *buf, RelType type) const { bool TargetInfo::usesOnlyLowPageBits(RelType type) const { return false; } bool TargetInfo::needsThunk(RelExpr expr, RelType type, const InputFile *file, - uint64_t branchAddr, const Symbol &s) const { + uint64_t branchAddr, const Symbol &s, + int64_t a) const { return false; } diff --git a/lld/ELF/Target.h b/lld/ELF/Target.h index 39b999176717f..9d147ed7b1f30 100644 --- a/lld/ELF/Target.h +++ b/lld/ELF/Target.h @@ -58,7 +58,7 @@ class TargetInfo { // targeting S. virtual bool needsThunk(RelExpr expr, RelType relocType, const InputFile *file, uint64_t branchAddr, - const Symbol &s) const; + const Symbol &s, int64_t a) const; // On systems with range extensions we place collections of Thunks at // regular spacings that enable the majority of branches reach the Thunks. diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp index d135177860439..8d2cdba616a68 100644 --- a/lld/ELF/Thunks.cpp +++ b/lld/ELF/Thunks.cpp @@ -49,7 +49,7 @@ namespace { // AArch64 long range Thunks class AArch64ABSLongThunk final : public Thunk { public: - AArch64ABSLongThunk(Symbol &dest) : Thunk(dest) {} + AArch64ABSLongThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} uint32_t size() override { return 16; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -57,7 +57,7 @@ class AArch64ABSLongThunk final : public Thunk { class AArch64ADRPThunk final : public Thunk { public: - AArch64ADRPThunk(Symbol &dest) : Thunk(dest) {} + AArch64ADRPThunk(Symbol &dest, int64_t addend) : Thunk(dest, addend) {} uint32_t size() override { return 12; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -73,7 +73,7 @@ class AArch64ADRPThunk final : public Thunk { // if the target is in range, otherwise it creates a long thunk.
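Aside (editorial): the SymbolTable.h hunk above replaces the forEachSymbol callback with a symbols() range built via llvm::make_filter_range; that is what let the earlier hunks in this patch rewrite callbacks into plain loops with break/continue. A self-contained stand-in for the filtered-range idea, without the LLVM dependency (the Thunks.cpp hunk resumes below):

```cpp
// Minimal filtered range, illustrative only: skips elements rejected by the
// predicate, so callers can use ordinary range-for with break/continue.
#include <iostream>
#include <string>
#include <vector>

struct Sym {
  std::string name;
  bool placeholder;
};

template <typename It, typename Pred> class FilterRange {
  It first, last;
  Pred pred;

public:
  struct iterator {
    It cur, last;
    const Pred *pred;
    void skip() { while (cur != last && !(*pred)(*cur)) ++cur; }
    iterator &operator++() { ++cur; skip(); return *this; }
    decltype(auto) operator*() const { return *cur; }
    bool operator!=(const iterator &o) const { return cur != o.cur; }
  };
  FilterRange(It f, It l, Pred p) : first(f), last(l), pred(p) {}
  iterator begin() const { iterator it{first, last, &pred}; it.skip(); return it; }
  iterator end() const { return {last, last, &pred}; }
};

int main() {
  std::vector<Sym> table{{"a", false}, {"b", true}, {"c", false}};
  auto notPlaceholder = [](const Sym &s) { return !s.placeholder; };
  FilterRange range(table.begin(), table.end(), notPlaceholder);
  for (const Sym &s : range) {
    if (s.name == "c")
      break;                     // early exit: awkward with a callback API
    std::cout << s.name << "\n"; // prints "a"; "b" is filtered out
  }
}
```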
class ARMThunk : public Thunk { public: - ARMThunk(Symbol &dest) : Thunk(dest) {} + ARMThunk(Symbol &dest) : Thunk(dest, 0) {} bool getMayUseShortThunk(); uint32_t size() override { return getMayUseShortThunk() ? 4 : sizeLong(); } @@ -103,7 +103,7 @@ class ARMThunk : public Thunk { // which has a range of 16MB. class ThumbThunk : public Thunk { public: - ThumbThunk(Symbol &dest) : Thunk(dest) { alignment = 2; } + ThumbThunk(Symbol &dest) : Thunk(dest, 0) { alignment = 2; } bool getMayUseShortThunk(); uint32_t size() override { return getMayUseShortThunk() ? 4 : sizeLong(); } @@ -209,7 +209,7 @@ class ThumbV6MPILongThunk final : public ThumbThunk { // MIPS LA25 thunk class MipsThunk final : public Thunk { public: - MipsThunk(Symbol &dest) : Thunk(dest) {} + MipsThunk(Symbol &dest) : Thunk(dest, 0) {} uint32_t size() override { return 16; } void writeTo(uint8_t *buf) override; @@ -220,7 +220,7 @@ class MipsThunk final : public Thunk { // microMIPS R2-R5 LA25 thunk class MicroMipsThunk final : public Thunk { public: - MicroMipsThunk(Symbol &dest) : Thunk(dest) {} + MicroMipsThunk(Symbol &dest) : Thunk(dest, 0) {} uint32_t size() override { return 14; } void writeTo(uint8_t *buf) override; @@ -231,7 +231,7 @@ class MicroMipsThunk final : public Thunk { // microMIPS R6 LA25 thunk class MicroMipsR6Thunk final : public Thunk { public: - MicroMipsR6Thunk(Symbol &dest) : Thunk(dest) {} + MicroMipsR6Thunk(Symbol &dest) : Thunk(dest, 0) {} uint32_t size() override { return 12; } void writeTo(uint8_t *buf) override; @@ -241,8 +241,11 @@ class MicroMipsR6Thunk final : public Thunk { class PPC32PltCallStub final : public Thunk { public: - PPC32PltCallStub(const InputSection &isec, const Relocation &rel, Symbol &dest) - : Thunk(dest), addend(rel.type == R_PPC_PLTREL24 ? rel.addend : 0), + // For R_PPC_PLTREL24, Thunk::addend records the addend which will be used to + // decide the offsets in the call stub. + PPC32PltCallStub(const InputSection &isec, const Relocation &rel, + Symbol &dest) + : Thunk(dest, rel.type == R_PPC_PLTREL24 ? rel.addend : 0), file(isec.file) {} uint32_t size() override { return 16; } void writeTo(uint8_t *buf) override; @@ -250,10 +253,6 @@ class PPC32PltCallStub final : public Thunk { bool isCompatibleWith(const InputSection &isec, const Relocation &rel) const override; private: - // For R_PPC_PLTREL24, this records the addend, which will be used to decide - // the offsets in the call stub. - uint32_t addend; - // Records the call site of the call stub. const InputFile *file; }; @@ -268,7 +267,7 @@ class PPC32PltCallStub final : public Thunk { // 3) Transferring control to the target function through an indirect branch. class PPC64PltCallStub final : public Thunk { public: - PPC64PltCallStub(Symbol &dest) : Thunk(dest) {} + PPC64PltCallStub(Symbol &dest) : Thunk(dest, 0) {} uint32_t size() override { return 20; } void writeTo(uint8_t *buf) override; void addSymbols(ThunkSection &isec) override; @@ -289,7 +288,7 @@ class PPC64LongBranchThunk : public Thunk { void addSymbols(ThunkSection &isec) override; protected: - PPC64LongBranchThunk(Symbol &dest) : Thunk(dest) {} + PPC64LongBranchThunk(Symbol &dest) : Thunk(dest, 0) {} }; class PPC64PILongBranchThunk final : public PPC64LongBranchThunk { @@ -332,8 +331,8 @@ void Thunk::setOffset(uint64_t newOffset) { // AArch64 long range Thunks -static uint64_t getAArch64ThunkDestVA(const Symbol &s) { - uint64_t v = s.isInPlt() ? 
s.getPltVA() : s.getVA(); +static uint64_t getAArch64ThunkDestVA(const Symbol &s, int64_t a) { + uint64_t v = s.isInPlt() ? s.getPltVA() : s.getVA(a); return v; } @@ -344,7 +343,7 @@ void AArch64ABSLongThunk::writeTo(uint8_t *buf) { 0x00, 0x00, 0x00, 0x00, // L0: .xword S 0x00, 0x00, 0x00, 0x00, }; - uint64_t s = getAArch64ThunkDestVA(destination); + uint64_t s = getAArch64ThunkDestVA(destination, addend); memcpy(buf, data, sizeof(data)); target->relocateOne(buf + 8, R_AARCH64_ABS64, s); } @@ -367,7 +366,7 @@ void AArch64ADRPThunk::writeTo(uint8_t *buf) { 0x10, 0x02, 0x00, 0x91, // add x16, x16, R_AARCH64_ADD_ABS_LO12_NC(Dest) 0x00, 0x02, 0x1f, 0xd6, // br x16 }; - uint64_t s = getAArch64ThunkDestVA(destination); + uint64_t s = getAArch64ThunkDestVA(destination, addend); uint64_t p = getThunkTargetSym()->getVA(); memcpy(buf, data, sizeof(data)); target->relocateOne(buf, R_AARCH64_ADR_PREL_PG_HI21, @@ -795,16 +794,16 @@ void PPC64LongBranchThunk::addSymbols(ThunkSection &isec) { isec); } -Thunk::Thunk(Symbol &d) : destination(d), offset(0) {} +Thunk::Thunk(Symbol &d, int64_t a) : destination(d), addend(a), offset(0) {} Thunk::~Thunk() = default; -static Thunk *addThunkAArch64(RelType type, Symbol &s) { +static Thunk *addThunkAArch64(RelType type, Symbol &s, int64_t a) { if (type != R_AARCH64_CALL26 && type != R_AARCH64_JUMP26) fatal("unrecognized relocation type"); if (config->picThunk) - return make<AArch64ADRPThunk>(s); - return make<AArch64ABSLongThunk>(s); + return make<AArch64ADRPThunk>(s, a); + return make<AArch64ABSLongThunk>(s, a); } // Creates a thunk for Thumb-ARM interworking. @@ -895,7 +894,8 @@ static Thunk *addThunkMips(RelType type, Symbol &s) { return make<MipsThunk>(s); } -static Thunk *addThunkPPC32(const InputSection &isec, const Relocation &rel, Symbol &s) { +static Thunk *addThunkPPC32(const InputSection &isec, const Relocation &rel, + Symbol &s) { assert((rel.type == R_PPC_REL24 || rel.type == R_PPC_PLTREL24) && "unexpected relocation type for thunk"); return make<PPC32PltCallStub>(isec, rel, s); } @@ -914,9 +914,10 @@ static Thunk *addThunkPPC64(RelType type, Symbol &s) { Thunk *addThunk(const InputSection &isec, Relocation &rel) { Symbol &s = *rel.sym; + int64_t a = rel.addend; if (config->emachine == EM_AARCH64) - return addThunkAArch64(rel.type, s); + return addThunkAArch64(rel.type, s, a); if (config->emachine == EM_ARM) return addThunkArm(rel.type, s); diff --git a/lld/ELF/Thunks.h b/lld/ELF/Thunks.h index 2d27ee5f6c38e..891bf8e5e4348 100644 --- a/lld/ELF/Thunks.h +++ b/lld/ELF/Thunks.h @@ -27,7 +27,7 @@ class ThunkSection; // Thunks are assigned to synthetic ThunkSections class Thunk { public: - Thunk(Symbol &destination); + Thunk(Symbol &destination, int64_t addend); virtual ~Thunk(); virtual uint32_t size() = 0; @@ -55,11 +55,12 @@ class Thunk { Defined *getThunkTargetSym() const { return syms[0]; } - // The alignment requirement for this Thunk, defaults to the size of the - // typical code section alignment. Symbol &destination; + int64_t addend; llvm::SmallVector<Defined *, 3> syms; uint64_t offset = 0; + // The alignment requirement for this Thunk, defaults to the size of the + // typical code section alignment. uint32_t alignment = 4; }; diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 3de1230150d64..ab59d0365085a 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1238,10 +1238,9 @@ static DenseMap<const InputSectionBase *, int> buildSectionOrder() { // We want both global and local symbols. We get the global ones from the // symbol table and iterate the object files for the local ones.
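Aside (editorial): the Thunks.cpp hunk above threads the relocation addend into the AArch64 thunks so a thunk materializes S + A instead of the bare symbol address. A toy model of the getAArch64ThunkDestVA arithmetic, with made-up values and a simplified Symbol (the Writer.cpp hunk resumes below):

```cpp
// Toy model, illustrative only: the thunk destination is the symbol VA plus
// the relocation addend, except for PLT-bound calls where the addend is
// irrelevant.
#include <cstdint>
#include <cstdio>

struct Symbol {
  uint64_t va;
  bool inPlt;
  uint64_t pltVa;
  // Mirrors the shape of lld's Symbol::getVA(addend): add A to the final VA.
  uint64_t getVA(int64_t a) const { return va + a; }
};

static uint64_t getThunkDestVA(const Symbol &s, int64_t a) {
  return s.inPlt ? s.pltVa : s.getVA(a);
}

int main() {
  Symbol section{0x200000, false, 0};
  // An STT_SECTION relocation "section + 0x40" must thunk to 0x200040; a
  // thunk keyed only on the symbol would wrongly land at 0x200000.
  printf("0x%llx\n", (unsigned long long)getThunkDestVA(section, 0x40));
}
```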
- symtab->forEachSymbol([&](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) if (!sym->isLazy()) addSym(*sym); - }); for (InputFile *file : objectFiles) for (Symbol *sym : file->getSymbols()) @@ -1734,8 +1733,8 @@ template void Writer::finalizeSections() { for (Partition &part : partitions) finalizeSynthetic(part.ehFrame); - symtab->forEachSymbol( - [](Symbol *s) { s->isPreemptible = computeIsPreemptible(*s); }); + for (Symbol *sym : symtab->symbols()) + sym->isPreemptible = computeIsPreemptible(*sym); // Change values of linker-script-defined symbols from placeholders (assigned // by declareSymbols) to actual definitions. @@ -1769,19 +1768,18 @@ template void Writer::finalizeSections() { return symtab->soNames.count(needed); }); - symtab->forEachSymbol([](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) if (sym->isUndefined() && !sym->isWeak()) if (auto *f = dyn_cast_or_null(sym->file)) if (f->allNeededIsKnown) error(toString(f) + ": undefined reference to " + toString(*sym)); - }); } // Now that we have defined all possible global symbols including linker- // synthesized ones. Visit all symbols to give the finishing touches. - symtab->forEachSymbol([](Symbol *sym) { + for (Symbol *sym : symtab->symbols()) { if (!includeInSymtab(*sym)) - return; + continue; if (in.symTab) in.symTab->addSymbol(sym); @@ -1791,7 +1789,7 @@ template void Writer::finalizeSections() { if (file->isNeeded && !sym->isUndefined()) addVerneed(sym); } - }); + } // We also need to scan the dynamic relocation tables of the other partitions // and add any referenced symbols to the partition's dynsym. @@ -2190,6 +2188,9 @@ std::vector Writer::createPhdrs(Partition &part) { if (config->zWxneeded) addHdr(PT_OPENBSD_WXNEEDED, PF_X); + if (OutputSection *cmd = findSection(".note.gnu.property", partNo)) + addHdr(PT_GNU_PROPERTY, PF_R)->add(cmd); + // Create one PT_NOTE per a group of contiguous SHT_NOTE sections with the // same alignment. PhdrEntry *note = nullptr; diff --git a/lld/test/ELF/aarch64-feature-bti.s b/lld/test/ELF/aarch64-feature-bti.s index f2889c6fcc92b..0fa1cf83727bd 100644 --- a/lld/test/ELF/aarch64-feature-bti.s +++ b/lld/test/ELF/aarch64-feature-bti.s @@ -55,28 +55,30 @@ # BTIDYN: 0x0000000070000001 (AARCH64_BTI_PLT) # BTIDYN-NOT: 0x0000000070000003 (AARCH64_PAC_PLT) -# BTISO: 0000000000010310 func2: -# BTISO-NEXT: 10310: bl #48 -# BTISO-NEXT: 10314: ret +# BTISO: 0000000000010348 func2: +# BTISO-NEXT: 10348: bl #56 +# BTISO-NEXT: ret +# BTISO: 0000000000010350 func3: +# BTISO-NEXT: 10350: ret # BTISO: Disassembly of section .plt: -# BTISO: 0000000000010320 .plt: -# BTISO-NEXT: 10320: bti c -# BTISO-NEXT: 10324: stp x16, x30, [sp, #-16]! -# BTISO-NEXT: 10328: adrp x16, #131072 -# BTISO-NEXT: 1032c: ldr x17, [x16, #1072] -# BTISO-NEXT: 10330: add x16, x16, #1072 -# BTISO-NEXT: 10334: br x17 -# BTISO-NEXT: 10338: nop -# BTISO-NEXT: 1033c: nop -# BTISO: 0000000000010340 func3@plt: -# BTISO-NEXT: 10340: adrp x16, #131072 -# BTISO-NEXT: 10344: ldr x17, [x16, #1080] -# BTISO-NEXT: 10348: add x16, x16, #1080 -# BTISO-NEXT: 1034c: br x17 +# BTISO: 0000000000010360 .plt: +# BTISO-NEXT: 10360: bti c +# BTISO-NEXT: stp x16, x30, [sp, #-16]! 
+# BTISO-NEXT: adrp x16, #131072 +# BTISO-NEXT: ldr x17, [x16, #1136] +# BTISO-NEXT: add x16, x16, #1136 +# BTISO-NEXT: br x17 +# BTISO-NEXT: nop +# BTISO-NEXT: nop +# BTISO: 0000000000010380 func3@plt: +# BTISO-NEXT: 10380: adrp x16, #131072 +# BTISO-NEXT: ldr x17, [x16, #1144] +# BTISO-NEXT: add x16, x16, #1144 +# BTISO-NEXT: br x17 # SOGOTPLT2: Hex dump of section '.got.plt' -# SOGOTPLT2-NEXT: 0x00030420 00000000 00000000 00000000 00000000 -# SOGOTPLT2-NEXT: 0x00030430 00000000 00000000 20030100 00000000 +# SOGOTPLT2-NEXT: 0x00030460 00000000 00000000 00000000 00000000 +# SOGOTPLT2-NEXT: 0x00030470 00000000 00000000 60030100 00000000 ## Build an executable with all relocatable inputs having the BTI ## .note.gnu.property. We expect a bti c in front of all PLT entries as the @@ -89,26 +91,26 @@ # RUN: llvm-objdump -d -mattr=+bti --no-show-raw-insn %t.exe | FileCheck --check-prefix=EXECBTI %s # EXECBTI: Disassembly of section .text: -# EXECBTI: 0000000000210310 func1: -# EXECBTI-NEXT: 210310: bl #48 -# EXECBTI-NEXT: 210314: ret +# EXECBTI: 0000000000210348 func1: +# EXECBTI-NEXT: 210348: bl #40 +# EXECBTI-NEXT: ret # EXECBTI: Disassembly of section .plt: -# EXECBTI: 0000000000210320 .plt: -# EXECBTI-NEXT: 210320: bti c -# EXECBTI-NEXT: 210324: stp x16, x30, [sp, #-16]! -# EXECBTI-NEXT: 210328: adrp x16, #131072 -# EXECBTI-NEXT: 21032c: ldr x17, [x16, #1112] -# EXECBTI-NEXT: 210330: add x16, x16, #1112 -# EXECBTI-NEXT: 210334: br x17 -# EXECBTI-NEXT: 210338: nop -# EXECBTI-NEXT: 21033c: nop -# EXECBTI: 0000000000210340 func2@plt: -# EXECBTI-NEXT: 210340: bti c -# EXECBTI-NEXT: 210344: adrp x16, #131072 -# EXECBTI-NEXT: 210348: ldr x17, [x16, #1120] -# EXECBTI-NEXT: 21034c: add x16, x16, #1120 -# EXECBTI-NEXT: 210350: br x17 -# EXECBTI-NEXT: 210354: nop +# EXECBTI: 0000000000210350 .plt: +# EXECBTI-NEXT: 210350: bti c +# EXECBTI-NEXT: stp x16, x30, [sp, #-16]! +# EXECBTI-NEXT: adrp x16, #131072 +# EXECBTI-NEXT: ldr x17, [x16, #1160] +# EXECBTI-NEXT: add x16, x16, #1160 +# EXECBTI-NEXT: br x17 +# EXECBTI-NEXT: nop +# EXECBTI-NEXT: nop +# EXECBTI: 0000000000210370 func2@plt: +# EXECBTI-NEXT: 210370: bti c +# EXECBTI-NEXT: adrp x16, #131072 +# EXECBTI-NEXT: ldr x17, [x16, #1168] +# EXECBTI-NEXT: add x16, x16, #1168 +# EXECBTI-NEXT: br x17 +# EXECBTI-NEXT: nop ## We expect the same for PIE, as the address of an ifunc can escape # RUN: ld.lld --pie %t.o %t.so %t2.so -o %tpie.exe @@ -117,26 +119,26 @@ # RUN: llvm-objdump -d -mattr=+bti --no-show-raw-insn %tpie.exe | FileCheck --check-prefix=PIE %s # PIE: Disassembly of section .text: -# PIE: 0000000000010310 func1: -# PIE-NEXT: 10310: bl #48 -# PIE-NEXT: 10314: ret +# PIE: 0000000000010348 func1: +# PIE-NEXT: 10348: bl #40 +# PIE-NEXT: ret # PIE: Disassembly of section .plt: -# PIE: 0000000000010320 .plt: -# PIE-NEXT: 10320: bti c -# PIE-NEXT: 10324: stp x16, x30, [sp, #-16]! -# PIE-NEXT: 10328: adrp x16, #131072 -# PIE-NEXT: 1032c: ldr x17, [x16, #1112] -# PIE-NEXT: 10330: add x16, x16, #1112 -# PIE-NEXT: 10334: br x17 -# PIE-NEXT: 10338: nop -# PIE-NEXT: 1033c: nop -# PIE: 0000000000010340 func2@plt: -# PIE-NEXT: 10340: bti c -# PIE-NEXT: 10344: adrp x16, #131072 -# PIE-NEXT: 10348: ldr x17, [x16, #1120] -# PIE-NEXT: 1034c: add x16, x16, #1120 -# PIE-NEXT: 10350: br x17 -# PIE-NEXT: 10354: nop +# PIE: 0000000000010350 .plt: +# PIE-NEXT: 10350: bti c +# PIE-NEXT: stp x16, x30, [sp, #-16]! 
+# PIE-NEXT: adrp x16, #131072 +# PIE-NEXT: ldr x17, [x16, #1160] +# PIE-NEXT: add x16, x16, #1160 +# PIE-NEXT: br x17 +# PIE-NEXT: nop +# PIE-NEXT: nop +# PIE: 0000000000010370 func2@plt: +# PIE-NEXT: 10370: bti c +# PIE-NEXT: adrp x16, #131072 +# PIE-NEXT: ldr x17, [x16, #1168] +# PIE-NEXT: add x16, x16, #1168 +# PIE-NEXT: br x17 +# PIE-NEXT: nop ## Build and executable with not all relocatable inputs having the BTI ## .note.property, expect no bti c and no .note.gnu.property entry @@ -148,24 +150,24 @@ # NOEX: Disassembly of section .text: # NOEX: 00000000002102e0 func1: # NOEX-NEXT: 2102e0: bl #48 -# NOEX-NEXT: 2102e4: ret +# NOEX-NEXT: ret # NOEX: 00000000002102e8 func3: # NOEX-NEXT: 2102e8: ret # NOEX: Disassembly of section .plt: # NOEX: 00000000002102f0 .plt: # NOEX-NEXT: 2102f0: stp x16, x30, [sp, #-16]! -# NOEX-NEXT: 2102f4: adrp x16, #131072 -# NOEX-NEXT: 2102f8: ldr x17, [x16, #1024] -# NOEX-NEXT: 2102fc: add x16, x16, #1024 -# NOEX-NEXT: 210300: br x17 -# NOEX-NEXT: 210304: nop -# NOEX-NEXT: 210308: nop -# NOEX-NEXT: 21030c: nop +# NOEX-NEXT: adrp x16, #131072 +# NOEX-NEXT: ldr x17, [x16, #1024] +# NOEX-NEXT: add x16, x16, #1024 +# NOEX-NEXT: br x17 +# NOEX-NEXT: nop +# NOEX-NEXT: nop +# NOEX-NEXT: nop # NOEX: 0000000000210310 func2@plt: # NOEX-NEXT: 210310: adrp x16, #131072 -# NOEX-NEXT: 210314: ldr x17, [x16, #1032] -# NOEX-NEXT: 210318: add x16, x16, #1032 -# NOEX-NEXT: 21031c: br x17 +# NOEX-NEXT: ldr x17, [x16, #1032] +# NOEX-NEXT: add x16, x16, #1032 +# NOEX-NEXT: br x17 ## Force BTI entries with the --force-bti command line option. Expect a warning ## from the file without the .note.gnu.property. @@ -180,28 +182,28 @@ # RUN: llvm-objdump -d -mattr=+bti --no-show-raw-insn %tforcebti.exe | FileCheck --check-prefix=FORCE %s # FORCE: Disassembly of section .text: -# FORCE: 0000000000210338 func1: -# FORCE-NEXT: 210338: bl #56 -# FORCE-NEXT: 21033c: ret -# FORCE: 0000000000210340 func3: -# FORCE-NEXT: 210340: ret +# FORCE: 0000000000210370 func1: +# FORCE-NEXT: 210370: bl #48 +# FORCE-NEXT: ret +# FORCE: 0000000000210378 func3: +# FORCE-NEXT: 210378: ret # FORCE: Disassembly of section .plt: -# FORCE: 0000000000210350 .plt: -# FORCE-NEXT: 210350: bti c -# FORCE-NEXT: 210354: stp x16, x30, [sp, #-16]! -# FORCE-NEXT: 210358: adrp x16, #131072 -# FORCE-NEXT: 21035c: ldr x17, [x16, #1144] -# FORCE-NEXT: 210360: add x16, x16, #1144 -# FORCE-NEXT: 210364: br x17 -# FORCE-NEXT: 210368: nop -# FORCE-NEXT: 21036c: nop -# FORCE: 0000000000210370 func2@plt: -# FORCE-NEXT: 210370: bti c -# FORCE-NEXT: 210374: adrp x16, #131072 -# FORCE-NEXT: 210378: ldr x17, [x16, #1152] -# FORCE-NEXT: 21037c: add x16, x16, #1152 -# FORCE-NEXT: 210380: br x17 -# FORCE-NEXT: 210384: nop +# FORCE: 0000000000210380 .plt: +# FORCE-NEXT: 210380: bti c +# FORCE-NEXT: stp x16, x30, [sp, #-16]! 
+# FORCE-NEXT: adrp x16, #131072 +# FORCE-NEXT: ldr x17, [x16, #1192] +# FORCE-NEXT: add x16, x16, #1192 +# FORCE-NEXT: br x17 +# FORCE-NEXT: nop +# FORCE-NEXT: nop +# FORCE: 00000000002103a0 func2@plt: +# FORCE-NEXT: 2103a0: bti c +# FORCE-NEXT: adrp x16, #131072 +# FORCE-NEXT: ldr x17, [x16, #1200] +# FORCE-NEXT: add x16, x16, #1200 +# FORCE-NEXT: br x17 +# FORCE-NEXT: nop .section ".note.gnu.property", "a" .long 4 diff --git a/lld/test/ELF/aarch64-feature-btipac.s b/lld/test/ELF/aarch64-feature-btipac.s index c1fa4c1d3b289..30e00b2dbbd81 100644 --- a/lld/test/ELF/aarch64-feature-btipac.s +++ b/lld/test/ELF/aarch64-feature-btipac.s @@ -15,28 +15,28 @@ # RUN: llvm-readelf --dynamic-table %t.so | FileCheck --check-prefix BTIPACDYN %s # BTIPACSO: Disassembly of section .text: -# BTIPACSO: 0000000000010310 func2: -# BTIPACSO-NEXT: 10310: bl #48 -# BTIPACSO-NEXT: 10314: ret -# BTIPACSO: 0000000000010318 func3: -# BTIPACSO-NEXT: 10318: ret +# BTIPACSO: 0000000000010348 func2: +# BTIPACSO-NEXT: 10348: bl #56 +# BTIPACSO-NEXT: ret +# BTIPACSO: 0000000000010350 func3: +# BTIPACSO-NEXT: 10350: ret # BTIPACSO: Disassembly of section .plt: -# BTIPACSO: 0000000000010320 .plt: -# BTIPACSO-NEXT: 10320: bti c -# BTIPACSO-NEXT: 10324: stp x16, x30, [sp, #-16]! -# BTIPACSO-NEXT: 10328: adrp x16, #131072 -# BTIPACSO-NEXT: 1032c: ldr x17, [x16, #1096] -# BTIPACSO-NEXT: 10330: add x16, x16, #1096 -# BTIPACSO-NEXT: 10334: br x17 -# BTIPACSO-NEXT: 10338: nop -# BTIPACSO-NEXT: 1033c: nop -# BTIPACSO: 0000000000010340 func3@plt: -# BTIPACSO-NEXT: 10340: adrp x16, #131072 -# BTIPACSO-NEXT: 10344: ldr x17, [x16, #1104] -# BTIPACSO-NEXT: 10348: add x16, x16, #1104 -# BTIPACSO-NEXT: 1034c: autia1716 -# BTIPACSO-NEXT: 10350: br x17 -# BTIPACSO-NEXT: 10354: nop +# BTIPACSO: 0000000000010360 .plt: +# BTIPACSO-NEXT: 10360: bti c +# BTIPACSO-NEXT: stp x16, x30, [sp, #-16]! +# BTIPACSO-NEXT: adrp x16, #131072 +# BTIPACSO-NEXT: ldr x17, [x16, #1160] +# BTIPACSO-NEXT: add x16, x16, #1160 +# BTIPACSO-NEXT: br x17 +# BTIPACSO-NEXT: nop +# BTIPACSO-NEXT: nop +# BTIPACSO: 0000000000010380 func3@plt: +# BTIPACSO-NEXT: 10380: adrp x16, #131072 +# BTIPACSO-NEXT: ldr x17, [x16, #1168] +# BTIPACSO-NEXT: add x16, x16, #1168 +# BTIPACSO-NEXT: autia1716 +# BTIPACSO-NEXT: br x17 +# BTIPACSO-NEXT: nop # BTIPACPROP: Properties: aarch64 feature: BTI, PAC @@ -53,29 +53,29 @@ # RUN: llvm-readelf --dynamic-table %t.exe | FileCheck --check-prefix BTIPACDYN %s # BTIPACEX: Disassembly of section .text: -# BTIPACEX: 0000000000210338 func1: -# BTIPACEX-NEXT: 210338: bl #56 -# BTIPACEX-NEXT: 21033c: ret -# BTIPACEX-NEXT: 210340: ret -# BTIPACEX: 0000000000210344 func3: -# BTIPACEX-NEXT: 210344: ret +# BTIPACEX: 0000000000210370 func1: +# BTIPACEX-NEXT: 210370: bl #48 +# BTIPACEX-NEXT: ret +# BTIPACEX-NEXT: ret +# BTIPACEX: 000000000021037c func3: +# BTIPACEX-NEXT: 21037c: ret # BTIPACEX: Disassembly of section .plt: -# BTIPACEX: 0000000000210350 .plt: -# BTIPACEX-NEXT: 210350: bti c -# BTIPACEX-NEXT: 210354: stp x16, x30, [sp, #-16]! 
-# BTIPACEX-NEXT: 210358: adrp x16, #131072 -# BTIPACEX-NEXT: 21035c: ldr x17, [x16, #1160] -# BTIPACEX-NEXT: 210360: add x16, x16, #1160 -# BTIPACEX-NEXT: 210364: br x17 -# BTIPACEX-NEXT: 210368: nop -# BTIPACEX-NEXT: 21036c: nop -# BTIPACEX: 0000000000210370 func2@plt: -# BTIPACEX-NEXT: 210370: bti c -# BTIPACEX-NEXT: 210374: adrp x16, #131072 -# BTIPACEX-NEXT: 210378: ldr x17, [x16, #1168] -# BTIPACEX-NEXT: 21037c: add x16, x16, #1168 -# BTIPACEX-NEXT: 210380: autia1716 -# BTIPACEX-NEXT: 210384: br x17 +# BTIPACEX: 0000000000210380 .plt: +# BTIPACEX-NEXT: 210380: bti c +# BTIPACEX-NEXT: stp x16, x30, [sp, #-16]! +# BTIPACEX-NEXT: adrp x16, #131072 +# BTIPACEX-NEXT: ldr x17, [x16, #1208] +# BTIPACEX-NEXT: add x16, x16, #1208 +# BTIPACEX-NEXT: br x17 +# BTIPACEX-NEXT: nop +# BTIPACEX-NEXT: nop +# BTIPACEX: 00000000002103a0 func2@plt: +# BTIPACEX-NEXT: 2103a0: bti c +# BTIPACEX-NEXT: adrp x16, #131072 +# BTIPACEX-NEXT: ldr x17, [x16, #1216] +# BTIPACEX-NEXT: add x16, x16, #1216 +# BTIPACEX-NEXT: autia1716 +# BTIPACEX-NEXT: br x17 ## Check that combinations of BTI+PAC with 0 properties results in standard PLT @@ -86,25 +86,25 @@ # EX: Disassembly of section .text: # EX: 00000000002102e0 func1: # EX-NEXT: 2102e0: bl #48 -# EX-NEXT: 2102e4: ret -# EX-NEXT: 2102e8: ret +# EX-NEXT: ret +# EX-NEXT: ret # EX: 00000000002102ec func3: # EX-NEXT: 2102ec: ret # EX: Disassembly of section .plt: # EX: 00000000002102f0 .plt: # EX-NEXT: 2102f0: stp x16, x30, [sp, #-16]! -# EX-NEXT: 2102f4: adrp x16, #131072 -# EX-NEXT: 2102f8: ldr x17, [x16, #1024] -# EX-NEXT: 2102fc: add x16, x16, #1024 -# EX-NEXT: 210300: br x17 -# EX-NEXT: 210304: nop -# EX-NEXT: 210308: nop -# EX-NEXT: 21030c: nop +# EX-NEXT: adrp x16, #131072 +# EX-NEXT: ldr x17, [x16, #1024] +# EX-NEXT: add x16, x16, #1024 +# EX-NEXT: br x17 +# EX-NEXT: nop +# EX-NEXT: nop +# EX-NEXT: nop # EX: 0000000000210310 func2@plt: # EX: 210310: adrp x16, #131072 -# EX-NEXT: 210314: ldr x17, [x16, #1032] -# EX-NEXT: 210318: add x16, x16, #1032 -# EX-NEXT: 21031c: br x17 +# EX-NEXT: ldr x17, [x16, #1032] +# EX-NEXT: add x16, x16, #1032 +# EX-NEXT: br x17 # NODYN-NOT: 0x0000000070000001 (AARCH64_BTI_PLT) # NODYN-NOT: 0x0000000070000003 (AARCH64_PAC_PLT) diff --git a/lld/test/ELF/aarch64-feature-pac.s b/lld/test/ELF/aarch64-feature-pac.s index cb0bcee70a8a8..7a4f8ee64ffdb 100644 --- a/lld/test/ELF/aarch64-feature-pac.s +++ b/lld/test/ELF/aarch64-feature-pac.s @@ -15,22 +15,22 @@ # NOPAC: 00000000000102b8 func2: # NOPAC-NEXT: 102b8: bl #56 -# NOPAC-NEXT: 102bc: ret +# NOPAC-NEXT: ret # NOPAC: Disassembly of section .plt: # NOPAC: 00000000000102d0 .plt: # NOPAC-NEXT: 102d0: stp x16, x30, [sp, #-16]! 
-# NOPAC-NEXT: 102d4: adrp x16, #131072 -# NOPAC-NEXT: 102d8: ldr x17, [x16, #960] -# NOPAC-NEXT: 102dc: add x16, x16, #960 -# NOPAC-NEXT: 102e0: br x17 -# NOPAC-NEXT: 102e4: nop -# NOPAC-NEXT: 102e8: nop -# NOPAC-NEXT: 102ec: nop +# NOPAC-NEXT: adrp x16, #131072 +# NOPAC-NEXT: ldr x17, [x16, #960] +# NOPAC-NEXT: add x16, x16, #960 +# NOPAC-NEXT: br x17 +# NOPAC-NEXT: nop +# NOPAC-NEXT: nop +# NOPAC-NEXT: nop # NOPAC: 00000000000102f0 func3@plt: # NOPAC-NEXT: 102f0: adrp x16, #131072 -# NOPAC-NEXT: 102f4: ldr x17, [x16, #968] -# NOPAC-NEXT: 102f8: add x16, x16, #968 -# NOPAC-NEXT: 102fc: br x17 +# NOPAC-NEXT: ldr x17, [x16, #968] +# NOPAC-NEXT: add x16, x16, #968 +# NOPAC-NEXT: br x17 # NOPACDYN-NOT: 0x0000000070000001 (AARCH64_BTI_PLT) # NOPACDYN-NOT: 0x0000000070000003 (AARCH64_PAC_PLT) @@ -44,34 +44,36 @@ ## PAC has no effect on PLT[0], for PLT[N] autia1716 is used to authenticate ## the address in x17 (context in x16) before branching to it. The dynamic ## loader is responsible for calling pacia1716 on the entry. -# PACSO: 0000000000010310 func2: -# PACSO-NEXT: 10310: bl #48 -# PACSO-NEXT: 10314: ret +# PACSO: 0000000000010348 func2: +# PACSO-NEXT: 10348: bl #56 +# PACSO-NEXT: ret +# PACSO: 0000000000010350 func3: +# PACSO-NEXT: 10350: ret # PACSO: Disassembly of section .plt: -# PACSO: 0000000000010320 .plt: -# PACSO-NEXT: 10320: stp x16, x30, [sp, #-16]! -# PACSO-NEXT: 10324: adrp x16, #131072 -# PACSO-NEXT: 10328: ldr x17, [x16, #1080] -# PACSO-NEXT: 1032c: add x16, x16, #1080 -# PACSO-NEXT: 10330: br x17 -# PACSO-NEXT: 10334: nop -# PACSO-NEXT: 10338: nop -# PACSO-NEXT: 1033c: nop -# PACSO: 0000000000010340 func3@plt: -# PACSO-NEXT: 10340: adrp x16, #131072 -# PACSO-NEXT: 10344: ldr x17, [x16, #1088] -# PACSO-NEXT: 10348: add x16, x16, #1088 -# PACSO-NEXT: 1034c: autia1716 -# PACSO-NEXT: 10350: br x17 -# PACSO-NEXT: 10354: nop +# PACSO: 0000000000010360 .plt: +# PACSO-NEXT: 10360: stp x16, x30, [sp, #-16]! +# PACSO-NEXT: adrp x16, #131072 +# PACSO-NEXT: ldr x17, [x16, #1144] +# PACSO-NEXT: add x16, x16, #1144 +# PACSO-NEXT: br x17 +# PACSO-NEXT: nop +# PACSO-NEXT: nop +# PACSO-NEXT: nop +# PACSO: 0000000000010380 func3@plt: +# PACSO-NEXT: 10380: adrp x16, #131072 +# PACSO-NEXT: ldr x17, [x16, #1152] +# PACSO-NEXT: add x16, x16, #1152 +# PACSO-NEXT: autia1716 +# PACSO-NEXT: br x17 +# PACSO-NEXT: nop # SOGOTPLT: Hex dump of section '.got.plt': # SOGOTPLT-NEXT: 0x000303b0 00000000 00000000 00000000 00000000 # SOGOTPLT-NEXT: 0x000303c0 00000000 00000000 d0020100 00000000 # SOGOTPLT2: Hex dump of section '.got.plt': -# SOGOTPLT2-NEXT: 0x00030428 00000000 00000000 00000000 00000000 -# SOGOTPLT2-NEXT: 0x00030438 00000000 00000000 20030100 00000000 +# SOGOTPLT2-NEXT: 0x00030468 00000000 00000000 00000000 00000000 +# SOGOTPLT2-NEXT: 0x00030478 00000000 00000000 60030100 00000000 # PACPROP: Properties: aarch64 feature: PAC @@ -89,29 +91,28 @@ # RUN: llvm-objdump -d -mattr=+v8.3a --no-show-raw-insn %tpacplt.exe | FileCheck --check-prefix PACPLT %s # PACPLT: Disassembly of section .text: -# PACPLT: 0000000000210338 func1: -# PACPLT-NEXT: 210338: bl #56 -# PACPLT-NEXT: 21033c: ret -# PACPLT: 0000000000210340 func3: -# PACPLT-NEXT: 210340: ret +# PACPLT: 0000000000210370 func1: +# PACPLT-NEXT: 210370: bl #48 +# PACPLT-NEXT: ret +# PACPLT: 0000000000210378 func3: +# PACPLT-NEXT: 210378: ret # PACPLT: Disassembly of section .plt: -# PACPLT: 0000000000210350 .plt: -# PACPLT-NEXT: 210350: stp x16, x30, [sp, #-16]! 
-# PACPLT-NEXT: 210354: adrp x16, #131072 -# PACPLT-NEXT: 210358: ldr x17, [x16, #1144] -# PACPLT-NEXT: 21035c: add x16, x16, #1144 -# PACPLT-NEXT: 210360: br x17 -# PACPLT-NEXT: 210364: nop -# PACPLT-NEXT: 210368: nop -# PACPLT-NEXT: 21036c: nop -# PACPLT: 0000000000210370 func2@plt: -# PACPLT-NEXT: 210370: adrp x16, #131072 -# PACPLT-NEXT: 210374: ldr x17, [x16, #1152] -# PACPLT-NEXT: 210378: add x16, x16, #1152 -# PACPLT-NEXT: 21037c: autia1716 -# PACPLT-NEXT: 210380: br x17 -# PACPLT-NEXT: 210384: nop - +# PACPLT: 0000000000210380 .plt: +# PACPLT-NEXT: 210380: stp x16, x30, [sp, #-16]! +# PACPLT-NEXT: adrp x16, #131072 +# PACPLT-NEXT: ldr x17, [x16, #1192] +# PACPLT-NEXT: add x16, x16, #1192 +# PACPLT-NEXT: br x17 +# PACPLT-NEXT: nop +# PACPLT-NEXT: nop +# PACPLT-NEXT: nop +# PACPLT: 00000000002103a0 func2@plt: +# PACPLT-NEXT: 2103a0: adrp x16, #131072 +# PACPLT-NEXT: ldr x17, [x16, #1200] +# PACPLT-NEXT: add x16, x16, #1200 +# PACPLT-NEXT: autia1716 +# PACPLT-NEXT: br x17 +# PACPLT-NEXT: nop .section ".note.gnu.property", "a" .long 4 diff --git a/lld/test/ELF/aarch64-ifunc-bti.s b/lld/test/ELF/aarch64-ifunc-bti.s index 6a50b317ca3d2..70369d3e9f818 100644 --- a/lld/test/ELF/aarch64-ifunc-bti.s +++ b/lld/test/ELF/aarch64-ifunc-bti.s @@ -4,37 +4,37 @@ # RUN: ld.lld --shared --soname=t1.so %t1.o -o %t1.so # RUN: ld.lld --pie %t1.so %t.o -o %t -# RUN: llvm-objdump -d -mattr=+bti -triple=aarch64-linux-gnu %t | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn -mattr=+bti -triple=aarch64-linux-gnu %t | FileCheck %s # When the address of an ifunc is taken using a non-got reference which clang # can do, LLD exports a canonical PLT entry that may have its address taken so # we must use bti c. # CHECK: Disassembly of section .plt: -# CHECK: 0000000000010340 .plt: -# CHECK-NEXT: 10340: 5f 24 03 d5 bti c -# CHECK-NEXT: 10344: f0 7b bf a9 stp x16, x30, [sp, #-16]! -# CHECK-NEXT: 10348: 10 01 00 90 adrp x16, #131072 -# CHECK-NEXT: 1034c: 11 5e 42 f9 ldr x17, [x16, #1208] -# CHECK-NEXT: 10350: 10 e2 12 91 add x16, x16, #1208 -# CHECK-NEXT: 10354: 20 02 1f d6 br x17 -# CHECK-NEXT: 10358: 1f 20 03 d5 nop -# CHECK-NEXT: 1035c: 1f 20 03 d5 nop -# CHECK: 0000000000010360 func1@plt: -# CHECK-NEXT: 10360: 5f 24 03 d5 bti c -# CHECK-NEXT: 10364: 10 01 00 90 adrp x16, #131072 -# CHECK-NEXT: 10368: 11 62 42 f9 ldr x17, [x16, #1216] -# CHECK-NEXT: 1036c: 10 02 13 91 add x16, x16, #1216 -# CHECK-NEXT: 10370: 20 02 1f d6 br x17 -# CHECK-NEXT: 10374: 1f 20 03 d5 nop +# CHECK: 0000000000010380 .plt: +# CHECK-NEXT: 10380: bti c +# CHECK-NEXT: stp x16, x30, [sp, #-16]! +# CHECK-NEXT: adrp x16, #131072 +# CHECK-NEXT: ldr x17, [x16, #1272] +# CHECK-NEXT: add x16, x16, #1272 +# CHECK-NEXT: br x17 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK: 00000000000103a0 func1@plt: +# CHECK-NEXT: 103a0: bti c +# CHECK-NEXT: adrp x16, #131072 +# CHECK-NEXT: ldr x17, [x16, #1280] +# CHECK-NEXT: add x16, x16, #1280 +# CHECK-NEXT: br x17 +# CHECK-NEXT: nop # CHECK-NEXT: ... 
-# CHECK: 0000000000010380 myfunc: -# CHECK-NEXT: 10380: 5f 24 03 d5 bti c -# CHECK-NEXT: 10384: 10 01 00 90 adrp x16, #131072 -# CHECK-NEXT: 10388: 11 66 42 f9 ldr x17, [x16, #1224] -# CHECK-NEXT: 1038c: 10 22 13 91 add x16, x16, #1224 -# CHECK-NEXT: 10390: 20 02 1f d6 br x17 -# CHECK-NEXT: 10394: 1f 20 03 d5 nop +# CHECK: 00000000000103c0 myfunc: +# CHECK-NEXT: 103c0: bti c +# CHECK-NEXT: adrp x16, #131072 +# CHECK-NEXT: ldr x17, [x16, #1288] +# CHECK-NEXT: add x16, x16, #1288 +# CHECK-NEXT: br x17 +# CHECK-NEXT: nop .section ".note.gnu.property", "a" .long 4 diff --git a/lld/test/ELF/aarch64-thunk-pi.s b/lld/test/ELF/aarch64-thunk-pi.s index 965a93764a730..2545f8fb2ea18 100644 --- a/lld/test/ELF/aarch64-thunk-pi.s +++ b/lld/test/ELF/aarch64-thunk-pi.s @@ -16,28 +16,36 @@ low_target: bl high_target ret // CHECK: low_target: -// CHECK-NEXT: d8: bl #0x10 <__AArch64ADRPThunk_high_target> +// CHECK-NEXT: d8: bl #0x18 <__AArch64ADRPThunk_high_target> // CHECK-NEXT: ret .hidden low_target2 .globl low_target2 .type low_target2, %function low_target2: - // Need thunk to high_target + // Need thunk to high_target2 bl high_target2 + // .text_high+8 = high_target2 + bl .text_high+8 ret // CHECK: low_target2: -// CHECK-NEXT: e0: bl #0x14 <__AArch64ADRPThunk_high_target2> +// CHECK-NEXT: e0: bl #0x1c <__AArch64ADRPThunk_high_target2> +// CHECK-NEXT: e4: bl #0x24 <__AArch64ADRPThunk_> // CHECK-NEXT: ret // Expect range extension thunks for .text_low // adrp calculation is (PC + signed immediate) & (!0xfff) // CHECK: __AArch64ADRPThunk_high_target: -// CHECK-NEXT: e8: adrp x16, #0x10000000 +// CHECK-NEXT: f0: adrp x16, #0x10000000 // CHECK-NEXT: add x16, x16, #0x40 // CHECK-NEXT: br x16 // CHECK: __AArch64ADRPThunk_high_target2: -// CHECK-NEXT: f4: adrp x16, #0x10000000 +// CHECK-NEXT: fc: adrp x16, #0x10000000 +// CHECK-NEXT: add x16, x16, #0x8 +// CHECK-NEXT: br x16 +/// Identical to the previous one, but for the target .text_high+8. 
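The extra __AArch64ADRPThunk_ checked in this test is the point of the change: with addends recorded on thunks, a call to .text_high+8 can no longer share the thunk created for a call to the bare symbol, because the destination address is computed from both. A rough sketch of the computation, following the getAArch64ThunkDestVA change earlier in this patch:

#include <cstdint>

struct Symbol {
  bool isInPlt() const { return inPlt; }
  uint64_t getPltVA() const { return pltVA; }
  // The addend offsets the returned address, e.g. .text_high+8.
  uint64_t getVA(int64_t addend) const { return va + addend; }
  bool inPlt = false;
  uint64_t pltVA = 0;
  uint64_t va = 0;
};

// The destination depends on the relocation addend as well as the symbol,
// so bl .text_high and bl .text_high+8 resolve to two distinct thunks.
static uint64_t getThunkDestVA(const Symbol &s, int64_t a) {
  return s.isInPlt() ? s.getPltVA() : s.getVA(a);
}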
+// CHECK: __AArch64ADRPThunk_: +// CHECK-NEXT: 108: adrp x16, #0x10000000 // CHECK-NEXT: add x16, x16, #0x8 // CHECK-NEXT: br x16 diff --git a/lld/test/ELF/aarch64-thunk-script.s b/lld/test/ELF/aarch64-thunk-script.s index cf8187dd5bb35..176c137223b29 100644 --- a/lld/test/ELF/aarch64-thunk-script.s +++ b/lld/test/ELF/aarch64-thunk-script.s @@ -15,6 +15,8 @@ _start: // Need thunk to high_target@plt bl high_target + // Need thunk to .text_high+4 + bl .text_high+4 ret .section .text_high, "ax", %progbits @@ -28,14 +30,21 @@ high_target: // CHECK: Disassembly of section .text_low: // CHECK-EMPTY: // CHECK-NEXT: _start: -// CHECK-NEXT: 2000: bl #0x8 <__AArch64AbsLongThunk_high_target> +// CHECK-NEXT: 2000: bl #0x10 <__AArch64AbsLongThunk_high_target> +// CHECK-NEXT: 2004: bl #0x1c <__AArch64AbsLongThunk_> // CHECK-NEXT: ret // CHECK: __AArch64AbsLongThunk_high_target: -// CHECK-NEXT: 2008: ldr x16, #0x8 +// CHECK-NEXT: 2010: ldr x16, #0x8 // CHECK-NEXT: br x16 // CHECK: $d: -// CHECK-NEXT: 2010: 00 20 00 08 .word 0x08002000 -// CHECK-NEXT: 2014: 00 00 00 00 .word 0x00000000 +// CHECK-NEXT: 2018: 00 20 00 08 .word 0x08002000 +// CHECK-NEXT: 201c: 00 00 00 00 .word 0x00000000 +// CHECK: __AArch64AbsLongThunk_: +// CHECK-NEXT: 2020: ldr x16, #0x8 +// CHECK-NEXT: 2024: br x16 +// CHECK: $d: +// CHECK-NEXT: 2028: 04 20 00 08 .word 0x08002004 +// CHECK-NEXT: 202c: 00 00 00 00 .word 0x00000000 // CHECK: Disassembly of section .text_high: // CHECK-EMPTY: // CHECK-NEXT: high_target: diff --git a/lld/test/ELF/compressed-debug-level.test b/lld/test/ELF/compressed-debug-level.test new file mode 100644 index 0000000000000..d755e9fedf13b --- /dev/null +++ b/lld/test/ELF/compressed-debug-level.test @@ -0,0 +1,38 @@ +# REQUIRES: x86, zlib + +# RUN: yaml2obj %s -o %t.o + +# RUN: ld.lld %t.o -o %t.default --compress-debug-sections=zlib +# RUN: llvm-readelf --sections %t.default | FileCheck -check-prefixes=HEADER,LEVEL1 %s + +# RUN: ld.lld -O0 %t.o -o %t.O0 --compress-debug-sections=zlib +# RUN: llvm-readelf --sections %t.O0 | FileCheck -check-prefixes=HEADER,LEVEL1 %s +# RUN: cmp %t.default %t.O0 + +# RUN: ld.lld -O1 %t.o -o %t.O1 --compress-debug-sections=zlib +# RUN: llvm-readelf --sections %t.O1 | FileCheck -check-prefixes=HEADER,LEVEL1 %s +# RUN: cmp %t.default %t.O1 + +# RUN: ld.lld -O2 %t.o -o %t.O2 --compress-debug-sections=zlib +# RUN: llvm-readelf --sections %t.O2 | FileCheck -check-prefixes=HEADER,LEVEL6 %s + +## LLD uses zlib compression of level 1 when -O0, -O1 and level 6 when -O2. +## Here we check how -O flag affects the size of compressed sections produced. + +# HEADER: [Nr] Name Type Address Off Size +# LEVEL1: [ 1] .debug_info PROGBITS 00000000 000094 00001c +# LEVEL6: [ 1] .debug_info PROGBITS 00000000 000094 00001a + +## A little arbitrary debug section which has a different size after +## applying compression of level 1 and 6. 
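The -O mapping this test pins down is easy to reproduce directly with zlib; the sketch below is illustrative and only assumes zlib's public compress2/compressBound API, not lld's actual compression path. The YAML input for the test follows.

#include <zlib.h>
#include <cstdio>
#include <vector>

// lld picks zlib level 1 (fastest) for -O0/-O1 and level 6 (zlib's
// default, smaller output) for -O2; the same 12-byte payload as the
// test compresses to different sizes at the two levels.
static int compressionLevel(int optimize) { return optimize >= 2 ? 6 : 1; }

int main() {
  const unsigned char data[] = {1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1};
  for (int opt : {0, 2}) {
    uLongf destLen = compressBound(sizeof(data));
    std::vector<unsigned char> out(destLen);
    if (compress2(out.data(), &destLen, data, sizeof(data),
                  compressionLevel(opt)) == Z_OK)
      std::printf("-O%d: %zu -> %lu bytes\n", opt, sizeof(data), destLen);
  }
}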
+ +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_386 +Sections: + - Name: .debug_info + Type: SHT_PROGBITS + Content: '010101010101010201010201' diff --git a/lld/test/ELF/gnu-property-align-32.s b/lld/test/ELF/gnu-property-align-32.s new file mode 100644 index 0000000000000..8022a49d34c6c --- /dev/null +++ b/lld/test/ELF/gnu-property-align-32.s @@ -0,0 +1,40 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=i686-linux-gnu %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readobj --sections -n %t | FileCheck %s + +## Check that .note.gnu.property has alignment 4 and is readable by llvm-readobj + +# CHECK: Name: .note.gnu.property +# CHECK-NEXT: Type: SHT_NOTE (0x7) +# CHECK-NEXT: Flags [ (0x2) +# CHECK-NEXT: SHF_ALLOC (0x2) +# CHECK-NEXT: ] +# CHECK-NEXT: Address: 0x4000F4 +# CHECK-NEXT: Offset: 0xF4 +# CHECK-NEXT: Size: 28 +# CHECK-NEXT: Link: 0 +# CHECK-NEXT: Info: 0 +# CHECK-NEXT: AddressAlignment: 4 + +# CHECK: Note { +# CHECK-NEXT: Owner: GNU +# CHECK-NEXT: Data size: 0xC +# CHECK-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note) +# CHECK-NEXT: Property [ +# CHECK-NEXT: x86 feature: IBT + +.section ".note.gnu.property", "a" +.p2align 2 +.long 4 +.long 0xc +.long 0x5 +.asciz "GNU" +.p2align 2 +.long 0xc0000002 # GNU_PROPERTY_X86_FEATURE_1_AND +.long 4 +.long 1 # GNU_PROPERTY_X86_FEATURE_1_IBT + +.text +.globl _start + ret diff --git a/lld/test/ELF/gnu-property-align.s b/lld/test/ELF/gnu-property-align.s new file mode 100644 index 0000000000000..b109c09039a2c --- /dev/null +++ b/lld/test/ELF/gnu-property-align.s @@ -0,0 +1,42 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-linux-gnu %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readobj --sections -n %t | FileCheck %s + +## Check that .note.gnu.property has alignment 8 and is readable by llvm-readobj + +# CHECK: Name: .note.gnu.property +# CHECK-NEXT: Type: SHT_NOTE (0x7) +# CHECK-NEXT: Flags [ (0x2) +# CHECK-NEXT: SHF_ALLOC (0x2) +# CHECK-NEXT: ] +# CHECK-NEXT: Address: 0x200190 +# CHECK-NEXT: Offset: 0x190 +# CHECK-NEXT: Size: 32 +# CHECK-NEXT: Link: 0 +# CHECK-NEXT: Info: 0 +# CHECK-NEXT: AddressAlignment: 8 + +# CHECK: Note { +# CHECK-NEXT: Owner: GNU +# CHECK-NEXT: Data size: 0x10 +# CHECK-NEXT: Type: NT_GNU_PROPERTY_TYPE_0 (property note) +# CHECK-NEXT: Property [ +# CHECK-NEXT: x86 feature: IBT + + +.section ".note.gnu.property", "a" +.long 4 +.long 0x10 +.long 0x5 +.asciz "GNU" + +.long 0xc0000002 # GNU_PROPERTY_X86_FEATURE_1_AND +.long 4 +.long 1 # GNU_PROPERTY_X86_FEATURE_1_IBT +.long 0 + + .text + .globl _start + .type _start, %function +_start: ret diff --git a/lld/test/ELF/linkerscript/discard-section-err.s b/lld/test/ELF/linkerscript/discard-section-err.s index bb77dbb087da3..dd3c666e115b1 100644 --- a/lld/test/ELF/linkerscript/discard-section-err.s +++ b/lld/test/ELF/linkerscript/discard-section-err.s @@ -20,8 +20,19 @@ # RUN: ld.lld -pie -o %t --script %t.script %t.o # RUN: echo "SECTIONS { /DISCARD/ : { *(.rela.dyn) } }" > %t.script -# RUN: not ld.lld -pie -o %t --script %t.script %t.o 2>&1 | \ -# RUN: FileCheck -check-prefix=RELADYN %s -# RELADYN: discarding .rela.dyn section is not allowed +# RUN: ld.lld -pie -o %t %t.o +# RUN: llvm-readobj -S %t | FileCheck --check-prefix=RELADYN %s +# RELADYN: Name: .rela.dyn +# RUN: ld.lld -pie -o %t --script %t.script %t.o +# RUN: llvm-readobj -S %t | FileCheck /dev/null --implicit-check-not='Name: .rela.dyn' + +# RUN: echo "SECTIONS { /DISCARD/ : { *(.relr.dyn) } }" > %t.script +# RUN: not ld.lld -pie 
--pack-dyn-relocs=relr -o %t --script %t.script %t.o 2>&1 | \ +# RUN: FileCheck -check-prefix=RELRDYN %s +# RELRDYN: discarding .relr.dyn section is not allowed -.comm foo,4,4 +.data +.align 8 +foo: +## Emits an R_X86_64_RELATIVE in -pie mode. +.quad foo diff --git a/lld/test/ELF/pt-gnu-property.s b/lld/test/ELF/pt-gnu-property.s new file mode 100644 index 0000000000000..5758967b0e0b5 --- /dev/null +++ b/lld/test/ELF/pt-gnu-property.s @@ -0,0 +1,45 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-linux-gnu %s -o %t.o +# RUN: ld.lld %t.o -o %t +# RUN: llvm-readobj --sections --program-headers %t | FileCheck %s + +## Test that we generate the PT_GNU_PROPERTY segment type that describes the +## .note.gnu.property if it is present. + +# CHECK: Name: .note.gnu.property +# CHECK-NEXT: Type: SHT_NOTE (0x7) +# CHECK-NEXT: Flags [ (0x2) +# CHECK-NEXT: SHF_ALLOC (0x2) +# CHECK-NEXT: ] +# CHECK-NEXT: Address: 0x200190 +# CHECK-NEXT: Offset: 0x190 +# CHECK-NEXT: Size: 32 +# CHECK-NEXT: Link: 0 +# CHECK-NEXT: Info: 0 +# CHECK-NEXT: AddressAlignment: 8 + +# CHECK: Type: PT_GNU_PROPERTY (0x6474E553) +# CHECK-NEXT: Offset: 0x190 +# CHECK-NEXT: VirtualAddress: 0x200190 +# CHECK-NEXT: PhysicalAddress: 0x200190 +# CHECK-NEXT: FileSize: 32 +# CHECK-NEXT: MemSize: 32 +# CHECK-NEXT: Flags [ (0x4) +# CHECK-NEXT: PF_R (0x4) +# CHECK-NEXT: ] +# CHECK-NEXT: Alignment: 8 + +.section ".note.gnu.property", "a" +.long 4 +.long 0x10 +.long 0x5 +.asciz "GNU" + +.long 0xc0000002 # GNU_PROPERTY_X86_FEATURE_1_AND +.long 4 +.long 1 # GNU_PROPERTY_X86_FEATURE_1_IBT +.long 0 + +.text +.globl _start + ret diff --git a/lld/test/ELF/undef-spell-corrector.s b/lld/test/ELF/undef-spell-corrector.s index 174c8009cba8d..3ad2421a6cd63 100644 --- a/lld/test/ELF/undef-spell-corrector.s +++ b/lld/test/ELF/undef-spell-corrector.s @@ -63,6 +63,16 @@ # CONST-NEXT: >>> referenced by {{.*}} # CONST-NEXT: >>> did you mean: foo(int const*) +## Case mismatch. 
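The new spell-corrector cases below feed the linker a reference that differs from the definition only in case. One plausible building block for such a suggestion, shown as an illustrative sketch rather than lld's actual matcher (which ranks candidates by edit distance):

#include <algorithm>
#include <cctype>
#include <string>

// True when two symbol names differ only in letter case, e.g.
// _Z3FOOPKi vs _Z3fooPKi, a natural trigger for "did you mean".
static bool equalsIgnoringCase(const std::string &a, const std::string &b) {
  return a.size() == b.size() &&
         std::equal(a.begin(), a.end(), b.begin(), [](char x, char y) {
           return std::tolower((unsigned char)x) ==
                  std::tolower((unsigned char)y);
         });
}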
+# RUN: echo 'call _Z3FOOPKi' | llvm-mc -filetype=obj -triple=x86_64 - -o %t1.o +# RUN: not ld.lld %t.o %t1.o -o /dev/null 2>&1 | FileCheck --check-prefix=CASE %s +# RUN: echo '_Z3fooPKi: call _Z3FOOPKi' | llvm-mc -filetype=obj -triple=x86_64 - -o %t1.o +# RUN: not ld.lld %t1.o -o /dev/null 2>&1 | FileCheck --check-prefix=CASE %s + +# CASE: error: undefined symbol: FOO(int const*) +# CASE-NEXT: >>> referenced by {{.*}} +# CASE-NEXT: >>> did you mean: foo(int const*) + .globl _start, abcde, _Z3fooPKi _start: abcde: diff --git a/lld/test/ELF/verdef-defaultver.s b/lld/test/ELF/verdef-defaultver.s index 3c10f2dcfe26f..7d2a0d27fa11d 100644 --- a/lld/test/ELF/verdef-defaultver.s +++ b/lld/test/ELF/verdef-defaultver.s @@ -84,6 +84,7 @@ # DSO-NEXT: Index: 1 # DSO-NEXT: Hash: 127830196 # DSO-NEXT: Name: shared +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -92,6 +93,7 @@ # DSO-NEXT: Index: 2 # DSO-NEXT: Hash: 1425 # DSO-NEXT: Name: V1 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -100,6 +102,7 @@ # DSO-NEXT: Index: 3 # DSO-NEXT: Hash: 1426 # DSO-NEXT: Name: V2 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: ] diff --git a/lld/test/ELF/verdef-dependency.s b/lld/test/ELF/verdef-dependency.s index 479f332d49306..d716436202535 100644 --- a/lld/test/ELF/verdef-dependency.s +++ b/lld/test/ELF/verdef-dependency.s @@ -15,6 +15,7 @@ # DSO-NEXT: Index: 1 # DSO-NEXT: Hash: 127830196 # DSO-NEXT: Name: shared +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -23,6 +24,7 @@ # DSO-NEXT: Index: 2 # DSO-NEXT: Hash: 98457184 # DSO-NEXT: Name: LIBSAMPLE_1.0 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -31,6 +33,7 @@ # DSO-NEXT: Index: 3 # DSO-NEXT: Hash: 98456416 # DSO-NEXT: Name: LIBSAMPLE_2.0 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -39,5 +42,6 @@ # DSO-NEXT: Index: 4 # DSO-NEXT: Hash: 98456672 # DSO-NEXT: Name: LIBSAMPLE_3.0 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: ] diff --git a/lld/test/ELF/verdef.s b/lld/test/ELF/verdef.s index d2aa924782f86..dd1f1d41f0148 100644 --- a/lld/test/ELF/verdef.s +++ b/lld/test/ELF/verdef.s @@ -33,6 +33,7 @@ # DSO-NEXT: Index: 1 # DSO-NEXT: Hash: 127830196 # DSO-NEXT: Name: shared +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -41,6 +42,7 @@ # DSO-NEXT: Index: 2 # DSO-NEXT: Hash: 98457184 # DSO-NEXT: Name: LIBSAMPLE_1.0 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -49,6 +51,7 @@ # DSO-NEXT: Index: 3 # DSO-NEXT: Hash: 98456416 # DSO-NEXT: Name: LIBSAMPLE_2.0 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: Definition { # DSO-NEXT: Version: 1 @@ -57,6 +60,7 @@ # DSO-NEXT: Index: 4 # DSO-NEXT: Hash: 98456672 # DSO-NEXT: Name: LIBSAMPLE_3.0 +# DSO-NEXT: Predecessors: [] # DSO-NEXT: } # DSO-NEXT: ] # DSO-NEXT: VersionRequirements [ diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 81d7dd8123bd0..e66fa49a51142 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -119,26 +119,20 @@ if (NOT LLDB_DISABLE_PYTHON) "${lldb_scripts_dir}/lldb.py" "${lldb_python_build_path}/__init__.py") - if(APPLE) - SET(lldb_python_heap_dir "${lldb_python_build_path}/macosx/heap") - add_custom_command(TARGET finish_swig POST_BUILD VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_heap_dir} - COMMAND 
${CMAKE_COMMAND} -E copy - "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/heap_find.cpp" - "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/Makefile" - ${lldb_python_heap_dir}) - endif() - - function(create_python_package target pkg_dir) - cmake_parse_arguments(ARG "" "" "FILES" ${ARGN}) + function(create_python_package pkg_dir) + cmake_parse_arguments(ARG "NOINIT" "" "FILES" ${ARGN}) if(ARG_FILES) set(copy_cmd COMMAND ${CMAKE_COMMAND} -E copy ${ARG_FILES} ${pkg_dir}) endif() - add_custom_command(TARGET ${target} POST_BUILD VERBATIM + if(NOT ARG_NOINIT) + set(init_cmd COMMAND ${PYTHON_EXECUTABLE} + "${LLDB_SOURCE_DIR}/scripts/Python/createPythonInit.py" + "${pkg_dir}" ${ARG_FILES}) + endif() + add_custom_command(TARGET finish_swig POST_BUILD VERBATIM COMMAND ${CMAKE_COMMAND} -E make_directory ${pkg_dir} ${copy_cmd} - COMMAND ${PYTHON_EXECUTABLE} "${LLDB_SOURCE_DIR}/scripts/Python/createPythonInit.py" - ${pkg_dir} ${ARG_FILES} + ${init_cmd} WORKING_DIRECTORY ${lldb_python_build_path}) endfunction() @@ -146,28 +140,33 @@ if (NOT LLDB_DISABLE_PYTHON) COMMAND ${CMAKE_COMMAND} -E copy "${LLDB_SOURCE_DIR}/source/Interpreter/embedded_interpreter.py" ${lldb_python_build_path}) - create_python_package(finish_swig "formatters/cpp" + # Distribute the examples as python packages. + create_python_package("formatters/cpp" FILES "${LLDB_SOURCE_DIR}/examples/synthetic/gnu_libstdcpp.py" "${LLDB_SOURCE_DIR}/examples/synthetic/libcxx.py") - # Make an empty __init__.py in lldb/runtime as this is required for - # Python to recognize lldb.runtime as a valid package (and hence, - # lldb.runtime.objc as a valid contained package) - create_python_package(finish_swig "runtime") - # Having these files copied here ensure that lldb/formatters is a - # valid package itself - create_python_package(finish_swig "formatters" + + create_python_package("formatters" FILES "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/cache.py" "${LLDB_SOURCE_DIR}/examples/summaries/synth.py" "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/metrics.py" "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/attrib_fromdict.py" "${LLDB_SOURCE_DIR}/examples/summaries/cocoa/Logger.py") - create_python_package(finish_swig "utils" - FILES "${LLDB_SOURCE_DIR}/examples/python/symbolication.py") + + create_python_package("utils" + FILES "${LLDB_SOURCE_DIR}/examples/python/in_call_stack.py" + "${LLDB_SOURCE_DIR}/examples/python/symbolication.py") + if(APPLE) - create_python_package(finish_swig "macosx" + create_python_package("macosx" FILES "${LLDB_SOURCE_DIR}/examples/python/crashlog.py" "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap.py") - create_python_package(finish_swig "diagnose" + + create_python_package("macosx/heap" + FILES "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/heap_find.cpp" + "${LLDB_SOURCE_DIR}/examples/darwin/heap_find/heap/Makefile" + NOINIT) + + create_python_package("diagnose" FILES "${LLDB_SOURCE_DIR}/examples/python/diagnose_unwind.py" "${LLDB_SOURCE_DIR}/examples/python/diagnose_nsstring.py") endif() diff --git a/lldb/docs/lldb-gdb-remote.txt b/lldb/docs/lldb-gdb-remote.txt index e3f11488df640..06cd09d77c412 100644 --- a/lldb/docs/lldb-gdb-remote.txt +++ b/lldb/docs/lldb-gdb-remote.txt @@ -790,6 +790,13 @@ distribution_id: optional. For linux, specifies distribution id (e.g. ubuntu, fe osmajor: optional, specifies the major version number of the OS (e.g. for macOS 10.12.2, it would be 10) osminor: optional, specifies the minor version number of the OS (e.g. 
for macOS 10.12.2, it would be 12)
ospatch: optional, specifies the patch level number of the OS (e.g. for macOS 10.12.2, it would be 2)
+addressing_bits: optional, specifies how many bits in addresses are
+                 significant for addressing, base 10. If bits 38..0
+                 in a 64-bit pointer are significant for addressing,
+                 then the value is 39. This is needed on e.g. AArch64
+                 v8.3 ABIs that use pointer authentication, so lldb
+                 knows which bits to clear/set to get the actual
+                 addresses.

 //----------------------------------------------------------------------
 // "qGDBServerVersion"
diff --git a/lldb/docs/use/map.rst b/lldb/docs/use/map.rst
index d878b5633e83f..3c6c6e6ffc620 100644
--- a/lldb/docs/use/map.rst
+++ b/lldb/docs/use/map.rst
@@ -880,6 +880,20 @@ Examining Variables
+
+Print an array of integers in memory, assuming we have a pointer like "int *ptr".
+
+    (gdb) p *ptr@10
+
+    (lldb) parray 10 ptr
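Given addressing_bits, the debugger can mask a pointer down to its significant bits before using it as an address. A minimal sketch of that masking, illustrative rather than lldb's actual implementation:

#include <cstdint>

// addressing_bits == 39 means bits 38..0 address memory; the higher bits
// may carry a pointer authentication code and must be cleared.
static uint64_t stripNonAddressableBits(uint64_t ptr, unsigned addressingBits) {
  if (addressingBits == 0 || addressingBits >= 64)
    return ptr; // no mask communicated, or all bits significant
  uint64_t mask = (UINT64_C(1) << addressingBits) - 1;
  return ptr & mask;
}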
+ + + diff --git a/lldb/include/lldb/API/SBReproducer.h b/lldb/include/lldb/API/SBReproducer.h index 93e567607aa85..8bb530a0fe429 100644 --- a/lldb/include/lldb/API/SBReproducer.h +++ b/lldb/include/lldb/API/SBReproducer.h @@ -20,7 +20,7 @@ class LLDB_API SBReproducer { public: static const char *Capture(); static const char *Capture(const char *path); - static const char *Replay(const char *path); + static const char *Replay(const char *path, bool skip_version_check = false); static const char *GetPath(); static bool Generate(); }; diff --git a/lldb/include/lldb/Breakpoint/BreakpointList.h b/lldb/include/lldb/Breakpoint/BreakpointList.h index 110e8d41f36b5..ad68151fefc78 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointList.h +++ b/lldb/include/lldb/Breakpoint/BreakpointList.h @@ -67,8 +67,10 @@ class BreakpointList { /// The breakpoint name for which to search. /// /// \result - /// \bfalse if the input name was not a legal breakpoint name. - bool FindBreakpointsByName(const char *name, BreakpointList &matching_bps); + /// error if the input name was not a legal breakpoint name, vector + /// of breakpoints otherwise. + llvm::Expected> + FindBreakpointsByName(const char *name); /// Returns the number of elements in this breakpoint list. /// diff --git a/lldb/include/lldb/Breakpoint/BreakpointOptions.h b/lldb/include/lldb/Breakpoint/BreakpointOptions.h index 9e02afff5227e..2c52170eb9f6a 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointOptions.h +++ b/lldb/include/lldb/Breakpoint/BreakpointOptions.h @@ -88,7 +88,8 @@ friend class Breakpoint; explicit CommandBaton(std::unique_ptr Data) : TypedBaton(std::move(Data)) {} - void GetDescription(Stream *s, lldb::DescriptionLevel level) const override; + void GetDescription(llvm::raw_ostream &s, lldb::DescriptionLevel level, + unsigned indentation) const override; }; typedef std::shared_ptr CommandBatonSP; diff --git a/lldb/include/lldb/Breakpoint/WatchpointOptions.h b/lldb/include/lldb/Breakpoint/WatchpointOptions.h index b395dde21901e..0dc34d4ebef73 100644 --- a/lldb/include/lldb/Breakpoint/WatchpointOptions.h +++ b/lldb/include/lldb/Breakpoint/WatchpointOptions.h @@ -180,7 +180,8 @@ class WatchpointOptions { CommandBaton(std::unique_ptr Data) : TypedBaton(std::move(Data)) {} - void GetDescription(Stream *s, lldb::DescriptionLevel level) const override; + void GetDescription(llvm::raw_ostream &s, lldb::DescriptionLevel level, + unsigned indentation) const override; }; protected: diff --git a/lldb/include/lldb/Core/IOHandler.h b/lldb/include/lldb/Core/IOHandler.h index 04b94da3a8c1e..5c1246751abc0 100644 --- a/lldb/include/lldb/Core/IOHandler.h +++ b/lldb/include/lldb/Core/IOHandler.h @@ -456,43 +456,6 @@ class IOHandlerConfirm : public IOHandlerDelegate, public IOHandlerEditline { bool m_user_response; }; -class IOHandlerCursesGUI : public IOHandler { -public: - IOHandlerCursesGUI(Debugger &debugger); - - ~IOHandlerCursesGUI() override; - - void Run() override; - - void Cancel() override; - - bool Interrupt() override; - - void GotEOF() override; - - void Activate() override; - - void Deactivate() override; - -protected: - curses::ApplicationAP m_app_ap; -}; - -class IOHandlerCursesValueObjectList : public IOHandler { -public: - IOHandlerCursesValueObjectList(Debugger &debugger, - ValueObjectList &valobj_list); - - ~IOHandlerCursesValueObjectList() override; - - void Run() override; - - void GotEOF() override; - -protected: - ValueObjectList m_valobj_list; -}; - class IOHandlerStack { public: IOHandlerStack() = default; diff --git 
a/lldb/include/lldb/Core/IOHandlerCursesGUI.h b/lldb/include/lldb/Core/IOHandlerCursesGUI.h new file mode 100644 index 0000000000000..afa4352697255 --- /dev/null +++ b/lldb/include/lldb/Core/IOHandlerCursesGUI.h @@ -0,0 +1,40 @@ +//===-- IOHandlerCursesGUI.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef liblldb_IOHandlerCursesGUI_h_ +#define liblldb_IOHandlerCursesGUI_h_ + +#include "lldb/Core/IOHandler.h" + +namespace lldb_private { + +class IOHandlerCursesGUI : public IOHandler { +public: + IOHandlerCursesGUI(Debugger &debugger); + + ~IOHandlerCursesGUI() override; + + void Run() override; + + void Cancel() override; + + bool Interrupt() override; + + void GotEOF() override; + + void Activate() override; + + void Deactivate() override; + +protected: + curses::ApplicationAP m_app_ap; +}; + +} // namespace lldb_private + +#endif // liblldb_IOHandlerCursesGUI_h_ diff --git a/lldb/include/lldb/Core/Module.h b/lldb/include/lldb/Core/Module.h index bb6c9bdad760f..2af18c83f23a3 100644 --- a/lldb/include/lldb/Core/Module.h +++ b/lldb/include/lldb/Core/Module.h @@ -190,7 +190,7 @@ class Module : public std::enable_shared_from_this, lldb::ModuleSP CalculateSymbolContextModule() override; void - GetDescription(Stream *s, + GetDescription(llvm::raw_ostream &s, lldb::DescriptionLevel level = lldb::eDescriptionLevelFull); /// Get the module path and object name. diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h index 651d0dc869bc2..6d024fe3434ba 100644 --- a/lldb/include/lldb/Core/ModuleSpec.h +++ b/lldb/include/lldb/Core/ModuleSpec.h @@ -207,7 +207,7 @@ class ModuleSpec { if (dumped_something) strm.PutCString(", "); strm.Printf("arch = "); - m_arch.DumpTriple(strm); + m_arch.DumpTriple(strm.AsRawOstream()); dumped_something = true; } if (m_uuid.IsValid()) { @@ -251,24 +251,18 @@ class ModuleSpec { if (match_module_spec.GetObjectName() && match_module_spec.GetObjectName() != GetObjectName()) return false; - if (match_module_spec.GetFileSpecPtr()) { - const FileSpec &fspec = match_module_spec.GetFileSpec(); - if (!FileSpec::Equal(fspec, GetFileSpec(), - !fspec.GetDirectory().IsEmpty())) - return false; - } - if (GetPlatformFileSpec() && match_module_spec.GetPlatformFileSpecPtr()) { - const FileSpec &fspec = match_module_spec.GetPlatformFileSpec(); - if (!FileSpec::Equal(fspec, GetPlatformFileSpec(), - !fspec.GetDirectory().IsEmpty())) - return false; + if (!FileSpec::Match(match_module_spec.GetFileSpec(), GetFileSpec())) + return false; + if (GetPlatformFileSpec() && + !FileSpec::Match(match_module_spec.GetPlatformFileSpec(), + GetPlatformFileSpec())) { + return false; } // Only match the symbol file spec if there is one in this ModuleSpec - if (GetSymbolFileSpec() && match_module_spec.GetSymbolFileSpecPtr()) { - const FileSpec &fspec = match_module_spec.GetSymbolFileSpec(); - if (!FileSpec::Equal(fspec, GetSymbolFileSpec(), - !fspec.GetDirectory().IsEmpty())) - return false; + if (GetSymbolFileSpec() && + !FileSpec::Match(match_module_spec.GetSymbolFileSpec(), + GetSymbolFileSpec())) { + return false; } if (match_module_spec.GetArchitecturePtr()) { if (exact_arch_match) { diff --git a/lldb/include/lldb/Core/STLUtils.h b/lldb/include/lldb/Core/STLUtils.h 
deleted file mode 100644 index f9500aa5594ed..0000000000000 --- a/lldb/include/lldb/Core/STLUtils.h +++ /dev/null @@ -1,26 +0,0 @@ -//===-- STLUtils.h ----------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef liblldb_STLUtils_h_ -#define liblldb_STLUtils_h_ - -#include - -#include -#include -#include - - -// C string less than compare function object -struct CStringCompareFunctionObject { - bool operator()(const char *s1, const char *s2) const { - return strcmp(s1, s2) < 0; - } -}; - -#endif // liblldb_STLUtils_h_ diff --git a/lldb/include/lldb/Core/SourceManager.h b/lldb/include/lldb/Core/SourceManager.h index bca817750d8da..f1f56d0886c3a 100644 --- a/lldb/include/lldb/Core/SourceManager.h +++ b/lldb/include/lldb/Core/SourceManager.h @@ -54,8 +54,6 @@ class SourceManager { bool LineIsValid(uint32_t line); - bool FileSpecMatches(const FileSpec &file_spec); - const FileSpec &GetFileSpec() { return m_file_spec; } uint32_t GetSourceMapModificationID() const { return m_source_map_mod_id; } diff --git a/lldb/include/lldb/Core/ThreadSafeDenseMap.h b/lldb/include/lldb/Core/ThreadSafeDenseMap.h index c485b91acb47a..420cb57635865 100644 --- a/lldb/include/lldb/Core/ThreadSafeDenseMap.h +++ b/lldb/include/lldb/Core/ThreadSafeDenseMap.h @@ -62,4 +62,4 @@ class ThreadSafeDenseMap { } // namespace lldb_private -#endif // liblldb_ThreadSafeSTLMap_h_ +#endif // liblldb_ThreadSafeDenseMap_h_ diff --git a/lldb/include/lldb/Core/ThreadSafeSTLMap.h b/lldb/include/lldb/Core/ThreadSafeSTLMap.h deleted file mode 100644 index df0208cd49b31..0000000000000 --- a/lldb/include/lldb/Core/ThreadSafeSTLMap.h +++ /dev/null @@ -1,128 +0,0 @@ -//===-- ThreadSafeSTLMap.h --------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef liblldb_ThreadSafeSTLMap_h_ -#define liblldb_ThreadSafeSTLMap_h_ - -#include -#include - -#include "lldb/lldb-defines.h" - -namespace lldb_private { - -template class ThreadSafeSTLMap { -public: - typedef std::map<_Key, _Tp> collection; - typedef typename collection::iterator iterator; - typedef typename collection::const_iterator const_iterator; - // Constructors and Destructors - ThreadSafeSTLMap() : m_collection(), m_mutex() {} - - ~ThreadSafeSTLMap() {} - - bool IsEmpty() const { - std::lock_guard guard(m_mutex); - return m_collection.empty(); - } - - void Clear() { - std::lock_guard guard(m_mutex); - return m_collection.clear(); - } - - size_t Erase(const _Key &key) { - std::lock_guard guard(m_mutex); - return EraseNoLock(key); - } - - size_t EraseNoLock(const _Key &key) { return m_collection.erase(key); } - - bool GetValueForKey(const _Key &key, _Tp &value) const { - std::lock_guard guard(m_mutex); - return GetValueForKeyNoLock(key, value); - } - - // Call this if you have already manually locked the mutex using the - // GetMutex() accessor - bool GetValueForKeyNoLock(const _Key &key, _Tp &value) const { - const_iterator pos = m_collection.find(key); - if (pos != m_collection.end()) { - value = pos->second; - return true; - } - return false; - } - - bool GetFirstKeyForValue(const _Tp &value, _Key &key) const { - std::lock_guard guard(m_mutex); - return GetFirstKeyForValueNoLock(value, key); - } - - bool GetFirstKeyForValueNoLock(const _Tp &value, _Key &key) const { - const_iterator pos, end = m_collection.end(); - for (pos = m_collection.begin(); pos != end; ++pos) { - if (pos->second == value) { - key = pos->first; - return true; - } - } - return false; - } - - bool LowerBound(const _Key &key, _Key &match_key, _Tp &match_value, - bool decrement_if_not_equal) const { - std::lock_guard guard(m_mutex); - return LowerBoundNoLock(key, match_key, match_value, - decrement_if_not_equal); - } - - bool LowerBoundNoLock(const _Key &key, _Key &match_key, _Tp &match_value, - bool decrement_if_not_equal) const { - const_iterator pos = m_collection.lower_bound(key); - if (pos != m_collection.end()) { - match_key = pos->first; - if (decrement_if_not_equal && key != match_key && - pos != m_collection.begin()) { - --pos; - match_key = pos->first; - } - match_value = pos->second; - return true; - } - return false; - } - - iterator lower_bound_unsafe(const _Key &key) { - return m_collection.lower_bound(key); - } - - void SetValueForKey(const _Key &key, const _Tp &value) { - std::lock_guard guard(m_mutex); - SetValueForKeyNoLock(key, value); - } - - // Call this if you have already manually locked the mutex using the - // GetMutex() accessor - void SetValueForKeyNoLock(const _Key &key, const _Tp &value) { - m_collection[key] = value; - } - - std::recursive_mutex &GetMutex() { return m_mutex; } - -private: - collection m_collection; - mutable std::recursive_mutex m_mutex; - - // For ThreadSafeSTLMap only - DISALLOW_COPY_AND_ASSIGN(ThreadSafeSTLMap); -}; - -} // namespace lldb_private - -#endif // liblldb_ThreadSafeSTLMap_h_ diff --git a/lldb/include/lldb/Core/ThreadSafeSTLVector.h b/lldb/include/lldb/Core/ThreadSafeSTLVector.h deleted file mode 100644 index e1666a69ef7ea..0000000000000 --- a/lldb/include/lldb/Core/ThreadSafeSTLVector.h +++ /dev/null @@ -1,72 +0,0 @@ -//===-- ThreadSafeSTLVector.h ------------------------------------*- C++ -//-*-===// 
-// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef liblldb_ThreadSafeSTLVector_h_ -#define liblldb_ThreadSafeSTLVector_h_ - -#include -#include - -#include "lldb/lldb-defines.h" - -namespace lldb_private { - -template class ThreadSafeSTLVector { -public: - typedef std::vector<_Object> collection; - typedef typename collection::iterator iterator; - typedef typename collection::const_iterator const_iterator; - // Constructors and Destructors - ThreadSafeSTLVector() : m_collection(), m_mutex() {} - - ~ThreadSafeSTLVector() = default; - - bool IsEmpty() const { - std::lock_guard guard(m_mutex); - return m_collection.empty(); - } - - void Clear() { - std::lock_guard guard(m_mutex); - return m_collection.clear(); - } - - size_t GetCount() { - std::lock_guard guard(m_mutex); - return m_collection.size(); - } - - void AppendObject(_Object &object) { - std::lock_guard guard(m_mutex); - m_collection.push_back(object); - } - - _Object GetObject(size_t index) { - std::lock_guard guard(m_mutex); - return m_collection.at(index); - } - - void SetObject(size_t index, const _Object &object) { - std::lock_guard guard(m_mutex); - m_collection.at(index) = object; - } - - std::recursive_mutex &GetMutex() { return m_mutex; } - -private: - collection m_collection; - mutable std::recursive_mutex m_mutex; - - // For ThreadSafeSTLVector only - DISALLOW_COPY_AND_ASSIGN(ThreadSafeSTLVector); -}; - -} // namespace lldb_private - -#endif // liblldb_ThreadSafeSTLVector_h_ diff --git a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h index 3b14a3e9f3885..ec395095351d0 100644 --- a/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h +++ b/lldb/include/lldb/Core/ValueObjectSyntheticFilter.h @@ -9,8 +9,6 @@ #ifndef liblldb_ValueObjectSyntheticFilter_h_ #define liblldb_ValueObjectSyntheticFilter_h_ -#include "lldb/Core/ThreadSafeSTLMap.h" -#include "lldb/Core/ThreadSafeSTLVector.h" #include "lldb/Core/ValueObject.h" #include "lldb/Symbol/CompilerType.h" #include "lldb/Utility/ConstString.h" @@ -135,19 +133,24 @@ class ValueObjectSynthetic : public ValueObject { lldb::SyntheticChildrenSP m_synth_sp; std::unique_ptr m_synth_filter_up; - typedef ThreadSafeSTLMap ByIndexMap; - typedef ThreadSafeSTLMap NameToIndexMap; - typedef ThreadSafeSTLVector SyntheticChildrenCache; + typedef std::map ByIndexMap; + typedef std::map NameToIndexMap; + typedef std::vector SyntheticChildrenCache; typedef ByIndexMap::iterator ByIndexIterator; typedef NameToIndexMap::iterator NameToIndexIterator; + std::mutex m_child_mutex; + /// Guarded by m_child_mutex; ByIndexMap m_children_byindex; + /// Guarded by m_child_mutex; NameToIndexMap m_name_toindex; + /// Guarded by m_child_mutex; + SyntheticChildrenCache m_synthetic_children_cache; + uint32_t m_synthetic_children_count; // FIXME use the ValueObject's // ChildrenManager instead of a special // purpose solution - SyntheticChildrenCache m_synthetic_children_cache; ConstString m_parent_type_name; diff --git a/lldb/include/lldb/DataFormatters/FormatManager.h b/lldb/include/lldb/DataFormatters/FormatManager.h index afaafda47e761..66df8397dfee4 100644 --- a/lldb/include/lldb/DataFormatters/FormatManager.h +++ b/lldb/include/lldb/DataFormatters/FormatManager.h @@ -52,24 +52,15 @@ class FormatManager : 
public IFormatChangeListener { void EnableCategory(ConstString category_name, TypeCategoryMap::Position pos = TypeCategoryMap::Default) { - EnableCategory(category_name, pos, - std::initializer_list()); + EnableCategory(category_name, pos, {}); } void EnableCategory(ConstString category_name, TypeCategoryMap::Position pos, lldb::LanguageType lang) { - std::initializer_list langs = {lang}; - EnableCategory(category_name, pos, langs); - } - - void EnableCategory(ConstString category_name, - TypeCategoryMap::Position pos = TypeCategoryMap::Default, - std::initializer_list langs = {}) { TypeCategoryMap::ValueSP category_sp; if (m_categories_map.Get(category_name, category_sp) && category_sp) { m_categories_map.Enable(category_sp, pos); - for (const lldb::LanguageType lang : langs) - category_sp->AddLanguage(lang); + category_sp->AddLanguage(lang); } } diff --git a/lldb/include/lldb/DataFormatters/TypeCategory.h b/lldb/include/lldb/DataFormatters/TypeCategory.h index a5438226bbbb8..dc5edb6549407 100644 --- a/lldb/include/lldb/DataFormatters/TypeCategory.h +++ b/lldb/include/lldb/DataFormatters/TypeCategory.h @@ -214,8 +214,7 @@ class TypeCategoryImpl { ValidatorContainer::RegexMatchForEachCallback m_validator_regex; }; - TypeCategoryImpl(IFormatChangeListener *clist, ConstString name, - std::initializer_list langs = {}); + TypeCategoryImpl(IFormatChangeListener *clist, ConstString name); template void ForEach(const ForEachCallbacks &foreach) { GetTypeFormatsContainer()->ForEach(foreach.GetFormatExactCallback()); @@ -359,8 +358,6 @@ class TypeCategoryImpl { void AddLanguage(lldb::LanguageType lang); - bool HasLanguage(lldb::LanguageType lang); - std::string GetDescription(); bool AnyMatches(ConstString type_name, diff --git a/lldb/include/lldb/Host/Editline.h b/lldb/include/lldb/Host/Editline.h index 65bf15531bc46..0cb2c6c5b6a14 100644 --- a/lldb/include/lldb/Host/Editline.h +++ b/lldb/include/lldb/Host/Editline.h @@ -133,6 +133,15 @@ enum class CursorLocation { /// session BlockEnd }; + +/// Operation for the history. +enum class HistoryOperation { + Oldest, + Older, + Current, + Newer, + Newest +}; } using namespace line_editor; @@ -258,11 +267,7 @@ class Editline { StringList GetInputAsStringList(int line_count = UINT32_MAX); /// Replaces the current multi-line session with the next entry from history. - /// When the parameter is - /// true it will take the next earlier entry from history, when it is false it - /// takes the next most - /// recent. - unsigned char RecallHistory(bool earlier); + unsigned char RecallHistory(HistoryOperation op); /// Character reading implementation for EditLine that supports our multi-line /// editing trickery. 
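The Editline change above replaces RecallHistory(bool earlier) with an explicit HistoryOperation, which reads unambiguously at call sites and leaves room for jumping to either end of the history. A hedged sketch of the resulting dispatch over a toy history, not Editline's actual el_history-backed body:

#include <cassert>
#include <cstddef>
#include <string>
#include <vector>

enum class HistoryOperation { Oldest, Older, Current, Newer, Newest };

struct History {
  std::vector<std::string> entries;
  size_t index = 0;

  const std::string &recall(HistoryOperation op) {
    assert(!entries.empty());
    switch (op) {
    case HistoryOperation::Oldest:  index = 0; break;
    case HistoryOperation::Older:   if (index > 0) --index; break;
    case HistoryOperation::Current: break; // re-read the current entry
    case HistoryOperation::Newer:   if (index + 1 < entries.size()) ++index; break;
    case HistoryOperation::Newest:  index = entries.size() - 1; break;
    }
    return entries[index];
  }
};

// history.recall(HistoryOperation::Older) replaces RecallHistory(true).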
diff --git a/lldb/include/lldb/Interpreter/CommandReturnObject.h b/lldb/include/lldb/Interpreter/CommandReturnObject.h
index 61e57fb798a1d..8af76e07e5ae1 100644
--- a/lldb/include/lldb/Interpreter/CommandReturnObject.h
+++ b/lldb/include/lldb/Interpreter/CommandReturnObject.h
@@ -9,7 +9,6 @@
 #ifndef liblldb_CommandReturnObject_h_
 #define liblldb_CommandReturnObject_h_
 
-#include "lldb/Core/STLUtils.h"
 #include "lldb/Core/StreamFile.h"
 #include "lldb/Utility/StreamString.h"
 #include "lldb/Utility/StreamTee.h"
diff --git a/lldb/include/lldb/Symbol/ClangASTContext.h b/lldb/include/lldb/Symbol/ClangASTContext.h
index f4428c6821825..b2c284282f11e 100644
--- a/lldb/include/lldb/Symbol/ClangASTContext.h
+++ b/lldb/include/lldb/Symbol/ClangASTContext.h
@@ -41,15 +41,17 @@ namespace lldb_private {
 class Declaration;
 
 class ClangASTContext : public TypeSystem {
+  // LLVM RTTI support
+  static char ID;
+
 public:
   typedef void (*CompleteTagDeclCallback)(void *baton, clang::TagDecl *);
   typedef void (*CompleteObjCInterfaceDeclCallback)(void *baton,
                                                     clang::ObjCInterfaceDecl *);
 
   // llvm casting support
-  static bool classof(const TypeSystem *ts) {
-    return ts->getKind() == TypeSystem::eKindClang;
-  }
+  bool isA(const void *ClassID) const override { return ClassID == &ID; }
+  static bool classof(const TypeSystem *ts) { return ts->isA(&ID); }
 
   // Constructors and Destructors
   explicit ClangASTContext(llvm::StringRef triple = "");
@@ -148,13 +150,8 @@ class ClangASTContext : public TypeSystem {
   CompilerType GetBuiltinTypeForEncodingAndBitSize(lldb::Encoding encoding,
                                                    size_t bit_size) override;
 
-  static CompilerType GetBuiltinTypeForEncodingAndBitSize(
-      clang::ASTContext *ast, lldb::Encoding encoding, uint32_t bit_size);
-
   CompilerType GetBasicType(lldb::BasicType type);
 
-  CompilerType GetBasicType(ConstString name);
-
   static lldb::BasicType GetBasicTypeEnumeration(ConstString name);
 
   CompilerType GetBuiltinTypeForDWARFEncodingAndBitSize(const char *type_name,
@@ -906,7 +903,8 @@ class ClangASTContext : public TypeSystem {
 
   static clang::TypedefNameDecl *GetAsTypedefDecl(const CompilerType &type);
 
-  clang::CXXRecordDecl *GetAsCXXRecordDecl(lldb::opaque_compiler_type_t type);
+  static clang::CXXRecordDecl *
+  GetAsCXXRecordDecl(lldb::opaque_compiler_type_t type);
 
   static clang::ObjCInterfaceDecl *
   GetAsObjCInterfaceDecl(const CompilerType &type);
diff --git a/lldb/include/lldb/Symbol/CompileUnit.h b/lldb/include/lldb/Symbol/CompileUnit.h
index 7efbf792b1a92..aec5cc7c8743b 100644
--- a/lldb/include/lldb/Symbol/CompileUnit.h
+++ b/lldb/include/lldb/Symbol/CompileUnit.h
@@ -13,6 +13,7 @@
 #include "lldb/Core/ModuleChild.h"
 #include "lldb/Symbol/DebugMacros.h"
 #include "lldb/Symbol/Function.h"
+#include "lldb/Symbol/LineTable.h"
 #include "lldb/Symbol/SourceModule.h"
 #include "lldb/Utility/Stream.h"
 #include "lldb/Utility/UserID.h"
@@ -35,7 +36,6 @@ namespace lldb_private {
 /// table.
 class CompileUnit : public std::enable_shared_from_this<CompileUnit>,
                     public ModuleChild,
-                    public FileSpec,
                     public UserID,
                     public SymbolContextScope {
 public:
@@ -116,9 +116,6 @@ class CompileUnit : public std::enable_shared_from_this<CompileUnit>,
               const FileSpec &file_spec, lldb::user_id_t uid,
               lldb::LanguageType language, lldb_private::LazyBool is_optimized);
 
-  /// Destructor
-  ~CompileUnit() override;
-
   /// Add a function to this compile unit.
   ///
   /// Typically called by the SymbolFile plug-ins as they partially parse the
@@ -225,6 +222,9 @@ class CompileUnit : public std::enable_shared_from_this<CompileUnit>,
                          const FileSpec *file_spec_ptr, bool exact,
                          LineEntry *line_entry);
 
+  /// Return the primary source file associated with this compile unit.
+  const FileSpec &GetPrimaryFile() const { return m_file_spec; }
+
   /// Get the line table for the compile unit.
   ///
   /// Called by clients and the SymbolFile plug-in. The SymbolFile plug-ins
@@ -381,14 +381,11 @@ class CompileUnit : public std::enable_shared_from_this<CompileUnit>,
   ///     A SymbolContext list class that will get any matching
   ///     entries appended to.
   ///
-  /// \return
-  ///     The number of new matches that were added to \a sc_list.
-  ///
   /// \see enum SymbolContext::Scope
-  uint32_t ResolveSymbolContext(const FileSpec &file_spec, uint32_t line,
-                                bool check_inlines, bool exact,
-                                lldb::SymbolContextItem resolve_scope,
-                                SymbolContextList &sc_list);
+  void ResolveSymbolContext(const FileSpec &file_spec, uint32_t line,
+                            bool check_inlines, bool exact,
+                            lldb::SymbolContextItem resolve_scope,
+                            SymbolContextList &sc_list);
 
   /// Get whether compiler optimizations were enabled for this compile unit
   ///
@@ -418,6 +415,8 @@ class CompileUnit : public std::enable_shared_from_this<CompileUnit>,
   /// All modules, including the current module, imported by this
   /// compile unit.
   std::vector<SourceModule> m_imported_modules;
+  /// The primary file associated with this compile unit.
+  FileSpec m_file_spec;
   /// Files associated with this compile unit's line table and
   /// declarations.
   FileSpecList m_support_files;
diff --git a/lldb/include/lldb/Symbol/CompilerDecl.h b/lldb/include/lldb/Symbol/CompilerDecl.h
index 4817ec4b22670..e4687ffb38536 100644
--- a/lldb/include/lldb/Symbol/CompilerDecl.h
+++ b/lldb/include/lldb/Symbol/CompilerDecl.h
@@ -18,13 +18,11 @@ namespace lldb_private {
 class CompilerDecl {
 public:
   // Constructors and Destructors
-  CompilerDecl() : m_type_system(nullptr), m_opaque_decl(nullptr) {}
+  CompilerDecl() = default;
 
   CompilerDecl(TypeSystem *type_system, void *decl)
       : m_type_system(type_system), m_opaque_decl(decl) {}
 
-  ~CompilerDecl() {}
-
   // Tests
 
   explicit operator bool() const { return IsValid(); }
@@ -39,8 +37,6 @@ class CompilerDecl {
     return m_type_system != nullptr && m_opaque_decl != nullptr;
   }
 
-  bool IsClang() const;
-
   // Accessors
 
   TypeSystem *GetTypeSystem() const { return m_type_system; }
@@ -75,8 +71,8 @@ class CompilerDecl {
   CompilerType GetFunctionArgumentType(size_t arg_idx) const;
 
 private:
-  TypeSystem *m_type_system;
-  void *m_opaque_decl;
+  TypeSystem *m_type_system = nullptr;
+  void *m_opaque_decl = nullptr;
 };
 
 bool operator==(const CompilerDecl &lhs, const CompilerDecl &rhs);
diff --git a/lldb/include/lldb/Symbol/CompilerDeclContext.h b/lldb/include/lldb/Symbol/CompilerDeclContext.h
index e7958c08d8334..fe8539ab30e68 100644
--- a/lldb/include/lldb/Symbol/CompilerDeclContext.h
+++ b/lldb/include/lldb/Symbol/CompilerDeclContext.h
@@ -19,13 +19,11 @@ namespace lldb_private {
 class CompilerDeclContext {
 public:
   // Constructors and Destructors
-  CompilerDeclContext() : m_type_system(nullptr), m_opaque_decl_ctx(nullptr) {}
+  CompilerDeclContext() = default;
 
   CompilerDeclContext(TypeSystem *type_system, void *decl_ctx)
       : m_type_system(type_system), m_opaque_decl_ctx(decl_ctx) {}
 
-  ~CompilerDeclContext() {}
-
   // Tests
 
   explicit operator bool() const { return IsValid(); }
@@ -40,8 +38,6 @@ class CompilerDeclContext {
     return m_type_system != nullptr && m_opaque_decl_ctx != nullptr;
   }
 
-  bool IsClang() const;
-
   std::vector<CompilerDecl> FindDeclByName(ConstString name,
                                            const bool ignore_using_decls);
 
@@ -105,8 +101,8 @@ class CompilerDeclContext {
   bool IsStructUnionOrClass() const;
 
 private:
-  TypeSystem *m_type_system;
-  void *m_opaque_decl_ctx;
+  TypeSystem *m_type_system = nullptr;
+  void *m_opaque_decl_ctx = nullptr;
 };
 
 bool operator==(const CompilerDeclContext &lhs, const CompilerDeclContext &rhs);
diff --git a/lldb/include/lldb/Symbol/CompilerType.h b/lldb/include/lldb/Symbol/CompilerType.h
index cedd2523a5a89..91d9c5e48d20c 100644
--- a/lldb/include/lldb/Symbol/CompilerType.h
+++ b/lldb/include/lldb/Symbol/CompilerType.h
@@ -357,14 +357,6 @@ class CompilerType {
   bool GetValueAsScalar(const DataExtractor &data, lldb::offset_t data_offset,
                         size_t data_byte_size, Scalar &value) const;
 
-  bool SetValueFromScalar(const Scalar &value, Stream &strm);
-
-  bool ReadFromMemory(ExecutionContext *exe_ctx, lldb::addr_t addr,
-                      AddressType address_type, DataExtractor &data);
-
-  bool WriteToMemory(ExecutionContext *exe_ctx, lldb::addr_t addr,
-                     AddressType address_type, StreamString &new_value);
-
   void Clear() {
     m_type = nullptr;
     m_type_system = nullptr;
diff --git a/lldb/include/lldb/Symbol/TypeSystem.h b/lldb/include/lldb/Symbol/TypeSystem.h
index 6283d67baba52..ea860647fdb1c 100644
--- a/lldb/include/lldb/Symbol/TypeSystem.h
+++ b/lldb/include/lldb/Symbol/TypeSystem.h
@@ -52,47 +52,11 @@ struct LanguageSet {
 /// Interface for representing the Type Systems in different languages.
 class TypeSystem : public PluginInterface {
 public:
-  // Intrusive type system that allows us to use llvm casting.
-  //
-  // To add a new type system:
-  //
-  // 1 - Add a new enumeration for llvm casting below for your TypeSystem
-  // subclass, here we will use eKindFoo
-  //
-  // 2 - Your TypeSystem subclass will inherit from TypeSystem and needs
-  // to implement a static classof() function that returns your
-  // enumeration:
-  //
-  //  class Foo : public lldb_private::TypeSystem
-  //  {
-  //    static bool classof(const TypeSystem *ts)
-  //    {
-  //      return ts->getKind() == TypeSystem::eKindFoo;
-  //    }
-  //  };
-  //
-  // 3 - Contruct your TypeSystem subclass with the enumeration from below
-  //
-  //  Foo() :
-  //      TypeSystem(TypeSystem::eKindFoo),
-  //      ...
-  //  {
-  //  }
-  //
-  // Then you can use the llvm casting on any "TypeSystem *" to get an instance
-  // of your subclass.
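Aside: what replaces the enumeration scheme being deleted here is LLVM-style RTTI keyed on the address of a per-class static, exactly as the ClangASTContext hunk above wires it up. Reduced to a self-contained sketch, with FooTypeSystem as a hypothetical subclass:

class TypeSystem {
public:
  virtual ~TypeSystem() = default;
  // Each subclass reports whether ClassID identifies its own class.
  virtual bool isA(const void *ClassID) const = 0;
};

class FooTypeSystem : public TypeSystem {
  // Only the address of ID matters; its value is never read.
  static char ID;

public:
  bool isA(const void *ClassID) const override { return ClassID == &ID; }
  static bool classof(const TypeSystem *ts) { return ts->isA(&ID); }
};
char FooTypeSystem::ID;

With classof in place, llvm::isa<FooTypeSystem> and llvm::dyn_cast<FooTypeSystem> work without any central kind enumeration, so out-of-tree type systems no longer need a slot in this header.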
-  enum LLVMCastKind {
-    eKindClang,
-    eKindSwift,
-    kNumKinds
-  };
-
   // Constructors and Destructors
-  TypeSystem(LLVMCastKind kind);
-
   ~TypeSystem() override;
 
-  LLVMCastKind getKind() const { return m_kind; }
+  // LLVM RTTI support
+  virtual bool isA(const void *ClassID) const = 0;
 
   static lldb::TypeSystemSP CreateInstance(lldb::LanguageType language,
                                            Module *module);
@@ -493,8 +457,7 @@ class TypeSystem : public PluginInterface {
   virtual bool IsMeaninglessWithoutDynamicResolution(void *type);
 
 protected:
-  const LLVMCastKind m_kind; // Support for llvm casting
-  SymbolFile *m_sym_file;
+  SymbolFile *m_sym_file = nullptr;
 };
 
 class TypeSystemMap {
diff --git a/lldb/include/lldb/Target/ABI.h b/lldb/include/lldb/Target/ABI.h
index 93378abc2ac2c..1aff1e2f78174 100644
--- a/lldb/include/lldb/Target/ABI.h
+++ b/lldb/include/lldb/Target/ABI.h
@@ -126,12 +126,7 @@ class ABI : public PluginInterface {
 
   llvm::MCRegisterInfo &GetMCRegisterInfo() { return *m_mc_register_info_up; }
 
-  virtual const RegisterInfo *GetRegisterInfoArray(uint32_t &count) = 0;
-
-  bool GetRegisterInfoByName(ConstString name, RegisterInfo &info);
-
-  bool GetRegisterInfoByKind(lldb::RegisterKind reg_kind, uint32_t reg_num,
-                             RegisterInfo &info);
+  virtual void AugmentRegisterInfo(RegisterInfo &info);
 
   virtual bool GetPointerReturnRegister(const char *&name) { return false; }
 
@@ -143,6 +138,10 @@ class ABI : public PluginInterface {
     assert(m_mc_register_info_up && "ABI must have MCRegisterInfo");
   }
 
+  bool GetRegisterInfoByName(ConstString name, RegisterInfo &info);
+
+  virtual const RegisterInfo *GetRegisterInfoArray(uint32_t &count) = 0;
+
   /// Utility function to construct a MCRegisterInfo using the ArchSpec triple.
   /// Plugins wishing to customize the construction can construct the
   /// MCRegisterInfo themselves.
diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h
index 81181a831a492..47c5c78704052 100644
--- a/lldb/include/lldb/Target/Process.h
+++ b/lldb/include/lldb/Target/Process.h
@@ -2185,11 +2185,9 @@ class Process : public std::enable_shared_from_this<Process>,
 
   OperatingSystem *GetOperatingSystem() { return m_os_up.get(); }
 
-  std::vector<LanguageRuntime *>
-  GetLanguageRuntimes(bool retry_if_null = true);
+  std::vector<LanguageRuntime *> GetLanguageRuntimes();
 
-  LanguageRuntime *GetLanguageRuntime(lldb::LanguageType language,
-                                      bool retry_if_null = true);
+  LanguageRuntime *GetLanguageRuntime(lldb::LanguageType language);
 
   bool IsPossibleDynamicValue(ValueObject &in_value);
 
diff --git a/lldb/include/lldb/Utility/ArchSpec.h b/lldb/include/lldb/Utility/ArchSpec.h
index ae79583768320..15e2fdb10c324 100644
--- a/lldb/include/lldb/Utility/ArchSpec.h
+++ b/lldb/include/lldb/Utility/ArchSpec.h
@@ -433,7 +433,7 @@ class ArchSpec {
   /// \return A triple describing this ArchSpec.
   const llvm::Triple &GetTriple() const { return m_triple; }
 
-  void DumpTriple(Stream &s) const;
+  void DumpTriple(llvm::raw_ostream &s) const;
 
   /// Architecture triple setter.
   ///
diff --git a/lldb/include/lldb/Utility/Baton.h b/lldb/include/lldb/Utility/Baton.h
index 4050f2af2bf04..c42867489c65d 100644
--- a/lldb/include/lldb/Utility/Baton.h
+++ b/lldb/include/lldb/Utility/Baton.h
@@ -12,6 +12,8 @@
 #include "lldb/lldb-enumerations.h"
 #include "lldb/lldb-public.h"
 
+#include "llvm/Support/raw_ostream.h"
+
 #include <memory>
 
 namespace lldb_private {
@@ -37,8 +39,9 @@ class Baton {
 
   virtual void *data() = 0;
 
-  virtual void GetDescription(Stream *s,
-                              lldb::DescriptionLevel level) const = 0;
+  virtual void GetDescription(llvm::raw_ostream &s,
+                              lldb::DescriptionLevel level,
+                              unsigned indentation) const = 0;
 };
 
 class UntypedBaton : public Baton {
@@ -50,7 +53,8 @@ class UntypedBaton : public Baton {
   }
 
   void *data() override { return m_data; }
-  void GetDescription(Stream *s, lldb::DescriptionLevel level) const override;
+  void GetDescription(llvm::raw_ostream &s, lldb::DescriptionLevel level,
+                      unsigned indentation) const override;
 
   void *m_data; // Leave baton public for easy access
 };
@@ -63,7 +67,8 @@ template <class T> class TypedBaton : public Baton {
   const T *getItem() const { return Item.get(); }
 
   void *data() override { return Item.get(); }
-  void GetDescription(Stream *s, lldb::DescriptionLevel level) const override {}
+  void GetDescription(llvm::raw_ostream &s, lldb::DescriptionLevel level,
+                      unsigned indentation) const override {}
 
 protected:
   std::unique_ptr<T> Item;
diff --git a/lldb/include/lldb/Utility/FileSpec.h b/lldb/include/lldb/Utility/FileSpec.h
index 53b0a9c08699c..61b6209bb3c02 100644
--- a/lldb/include/lldb/Utility/FileSpec.h
+++ b/lldb/include/lldb/Utility/FileSpec.h
@@ -75,18 +75,6 @@ class FileSpec {
 
   explicit FileSpec(llvm::StringRef path, const llvm::Triple &triple);
 
-  /// Copy constructor
-  ///
-  /// Makes a copy of the uniqued directory and filename strings from \a rhs
-  /// if it is not nullptr.
-  ///
-  /// \param[in] rhs
-  ///     A const FileSpec object pointer to copy if non-nullptr.
-  FileSpec(const FileSpec *rhs);
-
-  /// Destructor.
-  ~FileSpec();
-
   bool DirectoryEquals(const FileSpec &other) const;
 
   bool FileEquals(const FileSpec &other) const;
@@ -195,6 +183,12 @@ class FileSpec {
 
   static bool Equal(const FileSpec &a, const FileSpec &b, bool full);
 
+  /// Match FileSpec \a pattern against FileSpec \a file. If \a pattern has a
+  /// directory component, then the \a file must have the same directory
+  /// component. Otherwise, it matches just the filename. An empty \a
+  /// pattern matches everything.
+  static bool Match(const FileSpec &pattern, const FileSpec &file);
+
   /// Attempt to guess path style for a given path string. It returns a style,
   /// if it was able to make a reasonable guess, or None if it wasn't. The guess
   /// will be correct if the input path was a valid absolute path on the system
diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h
index 414f921773030..a3a33178086e7 100644
--- a/lldb/include/lldb/Utility/Stream.h
+++ b/lldb/include/lldb/Utility/Stream.h
@@ -213,85 +213,14 @@ class Stream {
   ///     in one statement.
   Stream &operator<<(char ch);
 
-  /// Output a uint8_t \a uval to the stream \a s.
-  ///
-  /// \param[in] uval
-  ///     A uint8_t value.
-  ///
-  /// \return
-  ///     A reference to this class so multiple things can be streamed
-  ///     in one statement.
-  Stream &operator<<(uint8_t uval);
-
-  /// Output a uint16_t \a uval to the stream \a s.
-  ///
-  /// \param[in] uval
-  ///     A uint16_t value.
-  ///
-  /// \return
-  ///     A reference to this class so multiple things can be streamed
-  ///     in one statement.
- Stream &operator<<(uint16_t uval); - - /// Output a uint32_t \a uval to the stream \a s. - /// - /// \param[in] uval - /// A uint32_t value. - /// - /// \return - /// A reference to this class so multiple things can be streamed - /// in one statement. - Stream &operator<<(uint32_t uval); - - /// Output a uint64_t \a uval to the stream \a s. - /// - /// \param[in] uval - /// A uint64_t value. - /// - /// \return - /// A reference to this class so multiple things can be streamed - /// in one statement. - Stream &operator<<(uint64_t uval); - - /// Output a int8_t \a sval to the stream \a s. - /// - /// \param[in] sval - /// A int8_t value. - /// - /// \return - /// A reference to this class so multiple things can be streamed - /// in one statement. - Stream &operator<<(int8_t sval); - - /// Output a int16_t \a sval to the stream \a s. - /// - /// \param[in] sval - /// A int16_t value. - /// - /// \return - /// A reference to this class so multiple things can be streamed - /// in one statement. - Stream &operator<<(int16_t sval); - - /// Output a int32_t \a sval to the stream \a s. - /// - /// \param[in] sval - /// A int32_t value. - /// - /// \return - /// A reference to this class so multiple things can be streamed - /// in one statement. - Stream &operator<<(int32_t sval); - - /// Output a int64_t \a sval to the stream \a s. - /// - /// \param[in] sval - /// A int64_t value. - /// - /// \return - /// A reference to this class so multiple things can be streamed - /// in one statement. - Stream &operator<<(int64_t sval); + Stream &operator<<(uint8_t uval) = delete; + Stream &operator<<(uint16_t uval) = delete; + Stream &operator<<(uint32_t uval) = delete; + Stream &operator<<(uint64_t uval) = delete; + Stream &operator<<(int8_t sval) = delete; + Stream &operator<<(int16_t sval) = delete; + Stream &operator<<(int32_t sval) = delete; + Stream &operator<<(int64_t sval) = delete; /// Output an address value to this stream. /// @@ -373,8 +302,8 @@ class Stream { /// Get the current indentation level. /// /// \return - /// The current indentation level as an integer. - int GetIndentLevel() const; + /// The current indentation level. + unsigned GetIndentLevel() const; /// Indent the current line in the stream. /// @@ -388,10 +317,10 @@ class Stream { size_t Indent(llvm::StringRef s); /// Decrement the current indentation level. - void IndentLess(int amount = 2); + void IndentLess(unsigned amount = 2); /// Increment the current indentation level. - void IndentMore(int amount = 2); + void IndentMore(unsigned amount = 2); /// Output an offset value. /// @@ -446,7 +375,7 @@ class Stream { /// /// \param[in] level /// The new indentation level. - void SetIndentLevel(int level); + void SetIndentLevel(unsigned level); /// Output a SLEB128 number to the stream. /// @@ -477,7 +406,7 @@ class Stream { uint32_t m_addr_size; ///< Size of an address in bytes. lldb::ByteOrder m_byte_order; ///< Byte order to use when encoding scalar types. - int m_indent_level; ///< Indention level. + unsigned m_indent_level; ///< Indention level. std::size_t m_bytes_written = 0; ///< Number of bytes written so far. 
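Aside: with the integer operator<< overloads deleted above, streaming an integer into a Stream no longer compiles, so every call site has to pick a representation explicitly instead of inheriting one silently. Illustrative only, using members this header already declares:

// Given lldb_private::Stream s and uint64_t uval, after this change:
//   s << uval;                  // ill-formed now: the overload is deleted
//   s.Printf("%" PRIu64, uval); // explicit decimal formatting
//   s.PutHex64(uval);           // explicit hex encoding
// The point is that the caller, not an implicit conversion, now chooses
// how the integer is rendered.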
void _PutHex8(uint8_t uvalue, bool add_prefix); diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index 3c80bcffec20e..0a92365544f99 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -690,6 +690,7 @@ enum SectionType { eSectionTypeDWARFDebugStrDwo, eSectionTypeDWARFDebugStrOffsetsDwo, eSectionTypeDWARFDebugTypesDwo, + eSectionTypeDWARFDebugRngListsDwo, }; FLAGS_ENUM(EmulateInstructionOptions){ diff --git a/lldb/packages/Python/lldbsuite/test/commands/breakpoint/command/list/TestBreakpointCommandList.py b/lldb/packages/Python/lldbsuite/test/commands/breakpoint/command/list/TestBreakpointCommandList.py new file mode 100644 index 0000000000000..f1a8656a73b55 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/breakpoint/command/list/TestBreakpointCommandList.py @@ -0,0 +1,44 @@ +""" +Test 'breakpoint command list'. +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class TestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @no_debug_info_test + def test_list_commands(self): + src_dir = self.getSourceDir() + yaml_path = os.path.join(src_dir, "a.yaml") + yaml_base, ext = os.path.splitext(yaml_path) + obj_path = self.getBuildArtifact("main.o") + self.yaml2obj(yaml_path, obj_path) + + # Create a target with the object file we just created from YAML + target = self.dbg.CreateTarget(obj_path) + self.assertTrue(target, VALID_TARGET) + + # Test without any breakpoints. + self.expect("breakpoint command list 1", error=True, substrs=["error: No breakpoints exist for which to list commands"]) + + # Set a breakpoint + self.runCmd("b foo") + + # Check list breakpoint commands for breakpoints that have no commands. + self.expect("breakpoint command list 1", startstr="Breakpoint 1 does not have an associated command.") + + # Add a breakpoint command. + self.runCmd("breakpoint command add -o 'source list' 1") + + # List breakpoint command that we just created. + self.expect("breakpoint command list 1", startstr="""Breakpoint 1: + Breakpoint commands: + source list +""") + + # List breakpoint command with invalid breakpoint ID. + self.expect("breakpoint command list 2", error=True, startstr="error: '2' is not a currently valid breakpoint ID.") diff --git a/lldb/packages/Python/lldbsuite/test/commands/breakpoint/command/list/a.yaml b/lldb/packages/Python/lldbsuite/test/commands/breakpoint/command/list/a.yaml new file mode 100644 index 0000000000000..1007f60c19ee3 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/breakpoint/command/list/a.yaml @@ -0,0 +1,18 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0000000000000010 + Content: 554889E5897DFC5DC3 +Symbols: + - Name: foo + Type: STT_FUNC + Section: .text + Size: 0x0000000000000009 +... diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/multiline-navigation/TestMultilineNavigation.py b/lldb/packages/Python/lldbsuite/test/commands/expression/multiline-navigation/TestMultilineNavigation.py new file mode 100644 index 0000000000000..712111209215d --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/multiline-navigation/TestMultilineNavigation.py @@ -0,0 +1,67 @@ +""" +Tests navigating in the multiline expression editor. 
+""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test.lldbpexpect import PExpectTest + +class TestCase(PExpectTest): + + mydir = TestBase.compute_mydir(__file__) + + arrow_up = "\033[A" + arrow_down = "\033[B" + + # PExpect uses many timeouts internally and doesn't play well + # under ASAN on a loaded machine.. + @skipIfAsan + def test_nav_arrow_up(self): + """Tests that we can navigate back to the previous line with the up arrow""" + self.launch() + + # Start multiline expression mode by just running 'expr' + self.child.sendline("expr") + self.child.expect_exact("terminate with an empty line to evaluate") + # Create a simple integer expression '123' and press enter. + self.child.send("123\n") + # We should see the prompt for the second line of our expression. + self.child.expect_exact("2: ") + # Go back to the first line and change 123 to 124. + # Then press enter twice to evaluate our expression. + self.child.send(self.arrow_up + "\b4\n\n") + # The result of our expression should be 124 (our edited expression) + # and not 123 (the one we initially typed). + self.child.expect_exact("(int) $0 = 124") + + self.quit() + + @skipIfAsan + def test_nav_arrow_down(self): + """Tests that we can navigate to the next line with the down arrow""" + self.launch() + + # Start multiline expression mode by just running 'expr' + self.child.sendline("expr") + self.child.expect_exact("terminate with an empty line to evaluate") + # Create a simple integer expression '111' and press enter. + self.child.send("111\n") + # We should see the prompt for the second line of our expression. + self.child.expect_exact("2: ") + # Create another simple integer expression '222'. + self.child.send("222") + # Go back to the first line and change '111' to '111+' to make + # an addition operation that spans two lines. We need to go up to + # test that we can go back down again. + self.child.send(self.arrow_up + "+") + # Go back down to our second line and change '222' to '223' + # so that the full expression is now '111+\n223'. + # Then press enter twice to evaluate the expression. + self.child.send(self.arrow_down + "\b3\n\n") + # The result of our expression '111 + 223' should be '334'. + # If the expression is '333' then arrow down failed to get + # us back to the second line. + self.child.expect_exact("(int) $0 = 334") + + self.quit() diff --git a/lldb/packages/Python/lldbsuite/test/commands/expression/static-initializers/TestStaticInitializers.py b/lldb/packages/Python/lldbsuite/test/commands/expression/static-initializers/TestStaticInitializers.py index e350e6ef930f9..61107077f9cff 100644 --- a/lldb/packages/Python/lldbsuite/test/commands/expression/static-initializers/TestStaticInitializers.py +++ b/lldb/packages/Python/lldbsuite/test/commands/expression/static-initializers/TestStaticInitializers.py @@ -7,6 +7,8 @@ class StaticInitializers(TestBase): mydir = TestBase.compute_mydir(__file__) + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44053") def test(self): """ Test a static initializer. 
""" self.build() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py b/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py index 4a5ed87e330ff..9513278ba084d 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/breakpoint_names/TestBreakpointNames.py @@ -155,8 +155,13 @@ def do_check_illegal_names(self): def do_check_using_names(self): """Use Python APIs to check names work in place of breakpoint ID's.""" + # Create a dummy breakpoint to use up ID 1 + _ = self.target.BreakpointCreateByLocation(self.main_file_spec, 30) + + # Create a breakpiont to test with bkpt = self.target.BreakpointCreateByLocation(self.main_file_spec, 10) bkpt_name = "ABreakpoint" + bkpt_id = bkpt.GetID() other_bkpt_name= "_AnotherBreakpoint" # Add a name and make sure we match it: @@ -169,6 +174,7 @@ def do_check_using_names(self): self.assertTrue(bkpts.GetSize() == 1, "One breakpoint matched.") found_bkpt = bkpts.GetBreakpointAtIndex(0) self.assertTrue(bkpt.GetID() == found_bkpt.GetID(),"The right breakpoint.") + self.assertTrue(bkpt.GetID() == bkpt_id,"With the same ID as before.") retval = lldb.SBCommandReturnObject() self.dbg.GetCommandInterpreter().HandleCommand("break disable %s"%(bkpt_name), retval) diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py b/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py index 020974ee469a8..4a571787f0118 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py @@ -26,6 +26,8 @@ def test_breakpoint(self): self.assertTrue(breakpoint.IsHardware()) @skipIfWindows + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44055") def test_step_range(self): """Test stepping when hardware breakpoints are required.""" self.build() @@ -47,6 +49,8 @@ def test_step_range(self): in error.GetCString()) @skipIfWindows + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44055") def test_step_out(self): """Test stepping out when hardware breakpoints are required.""" self.build() @@ -67,6 +71,8 @@ def test_step_out(self): in error.GetCString()) @skipIfWindows + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44055") def test_step_over(self): """Test stepping over when hardware breakpoints are required.""" self.build() @@ -85,6 +91,8 @@ def test_step_over(self): ]) @skipIfWindows + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44055") def test_step_until(self): """Test stepping until when hardware breakpoints are required.""" self.build() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py b/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py index 4842bc0945519..817d7de6bb960 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py +++ 
b/lldb/packages/Python/lldbsuite/test/functionalities/breakpoint/scripted_bkpt/TestScriptedResolver.py @@ -33,8 +33,7 @@ def test_search_depths(self): @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24528") def test_command_line(self): - """ Make sure we are called at the right depths depending on what we return - from __get_depth__""" + """ Test setting a resolver breakpoint from the command line """ self.build() self.do_test_cli() @@ -202,6 +201,23 @@ def do_test_cli(self): lldbutil.run_break_set_by_script(self, "resolver.Resolver", extra_options="-k symbol -v break_on_me") + # Make sure setting a resolver breakpoint doesn't pollute further breakpoint setting + # by checking the description of a regular file & line breakpoint to make sure it + # doesn't mention the Python Resolver function: + bkpt_no = lldbutil.run_break_set_by_file_and_line(self, "main.c", 12) + bkpt = target.FindBreakpointByID(bkpt_no) + strm = lldb.SBStream() + bkpt.GetDescription(strm, False) + used_resolver = "I am a python breakpoint resolver" in strm.GetData() + self.assertFalse(used_resolver, "Found the resolver description in the file & line breakpoint description.") + + # Also make sure the breakpoint was where we expected: + bp_loc = bkpt.GetLocationAtIndex(0) + bp_sc = bp_loc.GetAddress().GetSymbolContext(lldb.eSymbolContextEverything) + bp_se = bp_sc.GetLineEntry() + self.assertEqual(bp_se.GetLine(), 12, "Got the right line number") + self.assertEqual(bp_se.GetFileSpec().GetFilename(), "main.c", "Got the right filename") + def do_test_bad_options(self): target = self.make_target_and_import() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/TestTargetXMLArch.py b/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/TestTargetXMLArch.py index 9ea7cc8a4c7ea..20e575ae978b0 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/TestTargetXMLArch.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/TestTargetXMLArch.py @@ -4,6 +4,101 @@ from lldbsuite.test.decorators import * from gdbclientutils import * +class MyResponder(MockGDBServerResponder): + def qXferRead(self, obj, annex, offset, length): + if annex == "target.xml": + return """ + + i386:x86-64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + """, False + else: + return None, False + + def qC(self): + return "QC1" + + def haltReason(self): + return "T05thread:00000001;06:9038d60f00700000;07:98b4062680ffffff;10:c0d7bf1b80ffffff;" + + def readRegister(self, register): + regs = {0x0: "00b0060000610000", + 0xa: "68fe471c80ffffff", + 0xc: "60574a1c80ffffff", + 0xd: "18f3042680ffffff", + 0xe: "be8a4d7142000000", + 0xf: "50df471c80ffffff", + 0x10: "c0d7bf1b80ffffff" } + if register in regs: + return regs[register] + else: + return "0000000000000000" + class TestTargetXMLArch(GDBRemoteTestBase): @skipIfXmlSupportMissing @@ -14,102 +109,6 @@ def test(self): Test lldb's parsing of the tag in the target.xml register description packet. 
""" - class MyResponder(MockGDBServerResponder): - - def qXferRead(self, obj, annex, offset, length): - if annex == "target.xml": - return """ - - i386:x86-64 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - """, False - else: - return None, False - - def qC(self): - return "QC1" - - def haltReason(self): - return "T05thread:00000001;06:9038d60f00700000;07:98b4062680ffffff;10:c0d7bf1b80ffffff;" - - def readRegister(self, register): - regs = {0x0: "00b0060000610000", - 0xa: "68fe471c80ffffff", - 0xc: "60574a1c80ffffff", - 0xd: "18f3042680ffffff", - 0xe: "be8a4d7142000000", - 0xf: "50df471c80ffffff", - 0x10: "c0d7bf1b80ffffff" } - if register in regs: - return regs[register] - else: - return "0000000000000000" - self.server.responder = MyResponder() interp = self.dbg.GetCommandInterpreter() result = lldb.SBCommandReturnObject() @@ -125,3 +124,22 @@ def readRegister(self, register): interp.HandleCommand("target list", result) print(result.GetOutput()) self.assertTrue(target.GetTriple().startswith('x86_64-unknown-unknown')) + + @skipIfXmlSupportMissing + @skipIfRemote + def test_register_augmentation(self): + """ + Test that we correctly associate the register info with the eh_frame + register numbers. + """ + + target = self.createTarget("basic_eh_frame.yaml") + self.server.responder = MyResponder() + + process = self.connect(target) + lldbutil.expect_state_changes(self, self.dbg.GetListener(), process, + [lldb.eStateStopped]) + self.filecheck("image show-unwind -n foo", __file__, + "--check-prefix=UNWIND") +# UNWIND: eh_frame UnwindPlan: +# UNWIND: row[0]: 0: CFA=rsp+128 => rip=[CFA-8] diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/basic_eh_frame.yaml b/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/basic_eh_frame.yaml new file mode 100644 index 0000000000000..384b9b992b407 --- /dev/null +++ b/lldb/packages/Python/lldbsuite/test/functionalities/gdb_remote_client/basic_eh_frame.yaml @@ -0,0 +1,48 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 + Entry: 0x0000000000401000 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + Address: 0x0000000000401000 + AddressAlign: 0x0000000000000001 + Content: C3 + - Name: .eh_frame + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC ] + Address: 0x0000000000402000 + AddressAlign: 0x0000000000000008 + Content: 1800000000000000017A5200017810011B0C070890010E80010000001000000020000000DCEFFFFF0100000000000000 +Symbols: + - Name: .text + Type: STT_SECTION + Section: .text + Value: 0x0000000000401000 + - Name: .eh_frame + Type: STT_SECTION + Section: .eh_frame + Value: 0x0000000000402000 + - Name: _start + Binding: STB_GLOBAL + - Name: __bss_start + Section: .eh_frame + Binding: STB_GLOBAL + Value: 0x0000000000404000 + - Name: foo + Section: .text + Binding: STB_GLOBAL + Value: 0x0000000000401000 + - Name: _edata + Section: .eh_frame + Binding: STB_GLOBAL + Value: 0x0000000000404000 + - Name: _end + Section: .eh_frame + Binding: STB_GLOBAL + Value: 0x0000000000404000 +... 
diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/inline-stepping/TestInlineStepping.py b/lldb/packages/Python/lldbsuite/test/functionalities/inline-stepping/TestInlineStepping.py index a52cd4dd68653..ce4572361d931 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/inline-stepping/TestInlineStepping.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/inline-stepping/TestInlineStepping.py @@ -18,6 +18,8 @@ class TestInlineStepping(TestBase): compiler="icc", bugnumber="# Not really a bug. ICC combines two inlined functions.") @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") + @expectedFailureAll(archs=["aarch64"], oslist=["linux"], + bugnumber="llvm.org/pr44057") def test_with_python_api(self): """Test stepping over and into inlined functions.""" self.build() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/load_unload/TestLoadUnload.py b/lldb/packages/Python/lldbsuite/test/functionalities/load_unload/TestLoadUnload.py index 02e9198e38dff..ae0934c746162 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/load_unload/TestLoadUnload.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/load_unload/TestLoadUnload.py @@ -363,12 +363,16 @@ def run_load_unload(self): @skipIfFreeBSD # llvm.org/pr14424 - missing FreeBSD Makefiles/testcase support @skipIfWindows # Windows doesn't have dlopen and friends, dynamic libraries work differently + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=27806") def test_step_over_load(self): self.setSvr4Support(False) self.run_step_over_load() @skipIfFreeBSD # llvm.org/pr14424 - missing FreeBSD Makefiles/testcase support @skipIfWindows # Windows doesn't have dlopen and friends, dynamic libraries work differently + @expectedFailureAll(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=27806") def test_step_over_load_with_svr4(self): self.setSvr4Support(True) self.run_step_over_load() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile index 1adf3fc44a694..a49ffa84c5478 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile @@ -1,5 +1,5 @@ LEVEL = ../../../make CXX_SOURCES := main.cpp include $(LEVEL)/Makefile.rules -CXXFLAGS_EXTRAS := -O1 -glldb -Xclang -femit-debug-entry-values +CXXFLAGS_EXTRAS := -O2 -glldb -Xclang -femit-debug-entry-values include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/TestBasicEntryValuesX86_64.py b/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/TestBasicEntryValuesX86_64.py index 1192c2b672f6d..e0285e6d626d8 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/TestBasicEntryValuesX86_64.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/TestBasicEntryValuesX86_64.py @@ -6,8 +6,7 @@ supported_platforms.extend(lldbplatformutil.getDarwinOSTriples()) lldbinline.MakeInlineTest(__file__, globals(), - [decorators.skipIf(bugnumber="llvm.org/pr44059"), - 
decorators.skipUnlessPlatform(supported_platforms),
+                         [decorators.skipUnlessPlatform(supported_platforms),
                          decorators.skipIf(compiler="clang", compiler_version=['<', '10.0']),
                          decorators.skipUnlessArch('x86_64'),
                          decorators.skipUnlessHasCallSiteInfo,
diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/main.cpp b/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/main.cpp
index ff72a81c6b293..9aac6e947838e 100644
--- a/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/main.cpp
+++ b/lldb/packages/Python/lldbsuite/test/functionalities/param_entry_vals/basic_entry_values_x86_64/main.cpp
@@ -18,6 +18,14 @@ template <typename T> __attribute__((noinline)) void use(T x) {
       /* Clobbers */ : "rsi" \
   );
 
+// Destroy %rbx in the current frame.
+#define DESTROY_RBX \
+  asm volatile ("xorq %%rbx, %%rbx" \
+      /* Outputs */ : \
+      /* Inputs */ : \
+      /* Clobbers */ : "rbx" \
+  );
+
 struct S1 {
   int field1 = 123;
   int *field2 = &field1;
@@ -30,10 +38,17 @@ void func1(int &sink, int x) {
   // Destroy 'x' in the current frame.
   DESTROY_RSI;
 
-  //% self.filecheck("image lookup -va $pc", "main.cpp", "-check-prefix=FUNC1-DESC")
-  // FUNC1-DESC: name = "x", type = "int", location = DW_OP_entry_value(DW_OP_reg4 RSI)
+  // NOTE: Currently, we do not generate DW_OP_entry_value for the 'x',
+  // since it gets copied into a register that is not callee saved,
+  // and we can not guarantee that its value has not changed.
 
   ++sink;
+
+  // Destroy 'sink' in the current frame.
+  DESTROY_RBX;
+
+  //% self.filecheck("image lookup -va $pc", "main.cpp", "-check-prefix=FUNC1-DESC")
+  // FUNC1-DESC: name = "sink", type = "int &", location = DW_OP_entry_value(DW_OP_reg5 RDI)
 }
 
 __attribute__((noinline))
@@ -43,10 +58,16 @@ void func2(int &sink, int x) {
   // Destroy 'x' in the current frame.
   DESTROY_RSI;
 
-  //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC2-EXPR")
-  // FUNC2-EXPR: (int) ${{.*}} = 123
+  //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC2-EXPR-FAIL", expect_cmd_failure=True)
+  // FUNC2-EXPR-FAIL: couldn't get the value of variable x: variable not available
 
   ++sink;
+
+  // Destroy 'sink' in the current frame.
+  DESTROY_RBX;
+
+  //% self.filecheck("expr sink", "main.cpp", "-check-prefix=FUNC2-EXPR")
+  // FUNC2-EXPR: ${{.*}} = 2
 }
 
 __attribute__((noinline))
@@ -69,10 +90,16 @@ void func4_amb(int &sink, int x) {
   // Destroy 'x' in the current frame.
   DESTROY_RSI;
 
-  //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC4-EXPR", expect_cmd_failure=True)
-  // FUNC4-EXPR: couldn't get the value of variable x: Could not evaluate DW_OP_entry_value.
+  //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC4-EXPR-FAIL", expect_cmd_failure=True)
+  // FUNC4-EXPR-FAIL: couldn't get the value of variable x: variable not available
 
   ++sink;
+
+  // Destroy 'sink' in the current frame.
+  DESTROY_RBX;
+
+  //% self.filecheck("expr sink", "main.cpp", "-check-prefix=FUNC4-EXPR", expect_cmd_failure=True)
+  // FUNC4-EXPR: couldn't get the value of variable sink: Could not evaluate DW_OP_entry_value.
 }
 
 __attribute__((noinline))
@@ -98,10 +125,16 @@ void func7(int &sink, int x) {
   // Destroy 'x' in the current frame.
DESTROY_RSI; - //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC7-EXPR") - // FUNC7-EXPR: (int) ${{.*}} = 123 + //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC7-EXPR-FAIL", expect_cmd_failure=True) + // FUNC7-EXPR-FAIL: couldn't get the value of variable x: variable not available ++sink; + + // Destroy 'sink' in the current frame. + DESTROY_RBX; + + //% self.filecheck("expr sink", "main.cpp", "-check-prefix=FUNC7-EXPR") + // FUNC7-EXPR: ${{.*}} = 4 } __attribute__((always_inline)) @@ -129,10 +162,16 @@ void func11_tailcalled(int &sink, int x) { // Destroy 'x' in the current frame. DESTROY_RSI; - //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC11-EXPR") - // FUNC11-EXPR: (int) ${{.*}} = 123 + //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC11-EXPR-FAIL", expect_cmd_failure=True) + // FUNC11-EXPR-FAIL: couldn't get the value of variable x: variable not available ++sink; + + // Destroy 'sink' in the current frame. + DESTROY_RBX; + + //% self.filecheck("expr sink", "main.cpp", "-check-prefix=FUNC11-EXPR") + // FUNC11-EXPR: ${{.*}} = 5 } __attribute__((noinline)) @@ -150,10 +189,16 @@ void func13(int &sink, int x) { // Destroy 'x' in the current frame. DESTROY_RSI; - //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC13-EXPR") - // FUNC13-EXPR: (int) ${{.*}} = 123 + //% self.filecheck("expr x", "main.cpp", "-check-prefix=FUNC13-EXPR-FAIL", expect_cmd_failure=True) + // FUNC13-EXPR-FAIL: couldn't get the value of variable x: variable not available - ++sink; + use(sink); + + // Destroy 'sink' in the current frame. + DESTROY_RBX; + + //% self.filecheck("expr sink", "main.cpp", "-check-prefix=FUNC13-EXPR") + // FUNC13-EXPR: ${{.*}} = 5 } __attribute__((noinline, disable_tail_calls)) diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py b/lldb/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py index cb5dad50df8f7..e84bbc3c245d6 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py @@ -19,6 +19,9 @@ def affected_by_pr33042(self): return ("clang" in self.getCompiler() and self.getArchitecture() == "aarch64" and self.getPlatform() == "linux") + def affected_by_pr44132(self): + return (self.getArchitecture() == "aarch64" and self.getPlatform() == "linux") + # ABIMacOSX_arm can't fetch simple values inside a structure def affected_by_radar_34562999(self): return (self.getArchitecture() == 'armv7' or self.getArchitecture() == 'armv7k') and self.platformIsDarwin() @@ -123,7 +126,7 @@ def test_with_python(self): #self.assertTrue(in_float == return_float) - if not self.affected_by_radar_34562999(): + if not self.affected_by_radar_34562999() and not self.affected_by_pr44132(): self.return_and_test_struct_value("return_one_int") self.return_and_test_struct_value("return_two_int") self.return_and_test_struct_value("return_three_int") @@ -182,10 +185,12 @@ def test_vector_values(self): self.return_and_test_struct_value("return_vector_size_float32_8") self.return_and_test_struct_value("return_vector_size_float32_16") - self.return_and_test_struct_value("return_vector_size_float32_32") + if not self.affected_by_pr44132(): + self.return_and_test_struct_value("return_vector_size_float32_32") self.return_and_test_struct_value("return_ext_vector_size_float32_2") self.return_and_test_struct_value("return_ext_vector_size_float32_4") - 
self.return_and_test_struct_value("return_ext_vector_size_float32_8") + if not self.affected_by_pr44132(): + self.return_and_test_struct_value("return_ext_vector_size_float32_8") # limit the nested struct and class tests to only x86_64 @skipIf(archs=no_match(['x86_64'])) diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/show_location/TestShowLocationDwarf5.py b/lldb/packages/Python/lldbsuite/test/functionalities/show_location/TestShowLocationDwarf5.py index a56282efd77db..1d4bc6f134500 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/show_location/TestShowLocationDwarf5.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/show_location/TestShowLocationDwarf5.py @@ -9,6 +9,8 @@ class TestTargetSourceMap(TestBase): mydir = TestBase.compute_mydir(__file__) + @skipIf(archs="aarch64", oslist="linux", + bugnumber="https://bugs.llvm.org/show_bug.cgi?id=44180") def test_source_map(self): # Set the target soure map to map "./" to the current test directory. yaml_path = os.path.join(self.getSourceDir(), "a.yaml") diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/step-avoids-no-debug/TestStepNoDebug.py b/lldb/packages/Python/lldbsuite/test/functionalities/step-avoids-no-debug/TestStepNoDebug.py index c4fae7da0135a..ceee901fe3063 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/step-avoids-no-debug/TestStepNoDebug.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/step-avoids-no-debug/TestStepNoDebug.py @@ -50,6 +50,8 @@ def test_step_over_with_python(self): archs=["i386"], bugnumber="llvm.org/pr28549") @expectedFailureAll(oslist=["ios", "tvos", "bridgeos"], bugnumber="") # lldb doesn't step past last source line in function on arm64 + @expectedFailureAll(archs=["aarch64"], oslist=["linux"], + bugnumber="llvm.org/pr44057") def test_step_in_with_python(self): """Test stepping in using avoid-no-debug with dwarf.""" self.build() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq1/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq1/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq1/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq1/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq2/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq2/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq2/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/ambiguous_tail_call_seq2/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_call_site/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_call_site/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_call_site/Makefile +++ 
b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_call_site/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_paths_to_common_sink/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_paths_to_common_sink/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_paths_to_common_sink/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_paths_to_common_sink/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_tail_call_seq/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_tail_call_seq/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_tail_call_seq/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/disambiguate_tail_call_seq/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/inlining_and_tail_calls/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/inlining_and_tail_calls/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/inlining_and_tail_calls/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/inlining_and_tail_calls/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/sbapi_support/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/sbapi_support/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/sbapi_support/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/sbapi_support/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_message/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_message/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_message/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_message/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- 
a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/TestSteppingOutWithArtificialFrames.py b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/TestSteppingOutWithArtificialFrames.py index 2b432e56a7405..687fb0e7a5e86 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/TestSteppingOutWithArtificialFrames.py +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/thread_step_out_or_return/TestSteppingOutWithArtificialFrames.py @@ -71,6 +71,8 @@ def test_stepping_out_past_artificial_frame(self): self.assertFalse(frame2.IsArtificial()) @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr26265") + @expectedFailureAll(archs=["aarch64"], oslist=["linux"], + bugnumber="llvm.org/pr44160") def test_return_past_artificial_frame(self): self.build() thread = self.prepare_thread() diff --git a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/unambiguous_sequence/Makefile b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/unambiguous_sequence/Makefile index 48342e8e3afb9..666a6c3655460 100644 --- a/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/unambiguous_sequence/Makefile +++ b/lldb/packages/Python/lldbsuite/test/functionalities/tail_call_frames/unambiguous_sequence/Makefile @@ -1,4 +1,4 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g -O1 -glldb +CXXFLAGS_EXTRAS := -g -O2 -glldb include Makefile.rules diff --git a/lldb/packages/Python/lldbsuite/test/lang/c/step-target/TestStepTarget.py b/lldb/packages/Python/lldbsuite/test/lang/c/step-target/TestStepTarget.py index c694bda97c282..b3786fb94454f 100644 --- a/lldb/packages/Python/lldbsuite/test/lang/c/step-target/TestStepTarget.py +++ b/lldb/packages/Python/lldbsuite/test/lang/c/step-target/TestStepTarget.py @@ -32,7 +32,7 @@ def get_to_start(self): break_in_main = target.BreakpointCreateBySourceRegex( 'Break here to try targetted stepping', self.main_source_spec) self.assertTrue(break_in_main, VALID_BREAKPOINT) - self.assertTrue(break_in_main.GetNumLocations() > 0, "Has locations.") + self.assertGreater(break_in_main.GetNumLocations(), 0, "Has locations.") # Now launch the process, and do not stop at entry point. 
process = target.LaunchSimple( @@ -60,7 +60,7 @@ def test_with_end_line(self): thread.StepInto("lotsOfArgs", self.end_line, error) frame = thread.frames[0] - self.assertTrue(frame.name == "lotsOfArgs", "Stepped to lotsOfArgs.") + self.assertEqual(frame.name, "lotsOfArgs", "Stepped to lotsOfArgs.") @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_end_line_bad_name(self): @@ -71,8 +71,7 @@ def test_with_end_line_bad_name(self): error = lldb.SBError() thread.StepInto("lotsOfArgssss", self.end_line, error) frame = thread.frames[0] - self.assertTrue( - frame.line_entry.line == self.end_line, + self.assertEqual(frame.line_entry.line, self.end_line, "Stepped to the block end.") @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") @@ -84,7 +83,7 @@ def test_with_end_line_deeper(self): error = lldb.SBError() thread.StepInto("modifyInt", self.end_line, error) frame = thread.frames[0] - self.assertTrue(frame.name == "modifyInt", "Stepped to modifyInt.") + self.assertEqual(frame.name, "modifyInt", "Stepped to modifyInt.") @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_command_and_block(self): @@ -100,7 +99,7 @@ def test_with_command_and_block(self): "thread step-in command succeeded.") frame = thread.frames[0] - self.assertTrue(frame.name == "lotsOfArgs", "Stepped to lotsOfArgs.") + self.assertEqual(frame.name, "lotsOfArgs", "Stepped to lotsOfArgs.") @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") def test_with_command_and_block_and_bad_name(self): @@ -117,9 +116,8 @@ def test_with_command_and_block_and_bad_name(self): frame = thread.frames[0] - self.assertTrue(frame.name == "main", "Stepped back out to main.") + self.assertEqual(frame.name, "main", "Stepped back out to main.") # end_line is set to the line after the containing block. 
Check that # we got there: - self.assertTrue( - frame.line_entry.line == self.end_line, + self.assertEqual(frame.line_entry.line, self.end_line, "Got out of the block") diff --git a/lldb/packages/Python/lldbsuite/test/lang/cpp/trivial_abi/TestTrivialABI.py b/lldb/packages/Python/lldbsuite/test/lang/cpp/trivial_abi/TestTrivialABI.py index 9a203ef3a3088..78f7fa3afd73b 100644 --- a/lldb/packages/Python/lldbsuite/test/lang/cpp/trivial_abi/TestTrivialABI.py +++ b/lldb/packages/Python/lldbsuite/test/lang/cpp/trivial_abi/TestTrivialABI.py @@ -18,6 +18,8 @@ class TestTrivialABI(TestBase): @skipUnlessSupportedTypeAttribute("trivial_abi") @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr37995") + @expectedFailureAll(archs=["aarch64"], oslist=["linux"], + bugnumber="llvm.org/pr44161") def test_call_trivial(self): """Test that we can print a variable & call a function with a trivial ABI class.""" self.build() @@ -27,6 +29,8 @@ def test_call_trivial(self): @skipUnlessSupportedTypeAttribute("trivial_abi") # fixed for SysV-x86_64 ABI, but not Windows-x86_64 @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr36870") + @expectedFailureAll(archs=["aarch64"], oslist=["linux"], + bugnumber="llvm.org/pr44161") def test_call_nontrivial(self): """Test that we can print a variable & call a function on the same class w/o the trivial ABI marker.""" self.build() diff --git a/lldb/packages/Python/lldbsuite/test/linux/builtin_trap/TestBuiltinTrap.py b/lldb/packages/Python/lldbsuite/test/linux/builtin_trap/TestBuiltinTrap.py index 951f59e611e2f..28debcee9da1a 100644 --- a/lldb/packages/Python/lldbsuite/test/linux/builtin_trap/TestBuiltinTrap.py +++ b/lldb/packages/Python/lldbsuite/test/linux/builtin_trap/TestBuiltinTrap.py @@ -24,7 +24,7 @@ def setUp(self): # gcc generates incorrect linetable @expectedFailureAll(archs="arm", compiler="gcc", triple=".*-android") - @expectedFailureAll(oslist=['linux'], archs=['arm']) + @expectedFailureAll(oslist=['linux'], archs=['arm', 'aarch64']) @skipIfWindows def test_with_run_command(self): """Test that LLDB handles a function with __builtin_trap correctly.""" diff --git a/lldb/packages/Python/lldbsuite/test/python_api/formatters/TestFormattersSBAPI.py b/lldb/packages/Python/lldbsuite/test/python_api/formatters/TestFormattersSBAPI.py index 1bc52b3e66769..5c87d74e22d22 100644 --- a/lldb/packages/Python/lldbsuite/test/python_api/formatters/TestFormattersSBAPI.py +++ b/lldb/packages/Python/lldbsuite/test/python_api/formatters/TestFormattersSBAPI.py @@ -68,17 +68,17 @@ def cleanup(): self.expect("frame variable foo.E", substrs=['b8cca70a']) - format.format = lldb.eFormatOctal + format.SetFormat(lldb.eFormatOctal) category.AddTypeFormat(lldb.SBTypeNameSpecifier("int"), format) self.expect("frame variable foo.A", - substrs=['01']) + substrs=[' 01']) self.expect("frame variable foo.E", substrs=['b8cca70a']) category.DeleteTypeFormat(lldb.SBTypeNameSpecifier("int")) category.DeleteTypeFormat(lldb.SBTypeNameSpecifier("long")) self.expect("frame variable foo.A", matching=False, - substrs=['01']) + substrs=[' 01']) self.expect("frame variable foo.E", matching=False, substrs=['b8cca70a']) @@ -90,10 +90,13 @@ def cleanup(): new_category.IsValid(), "getting a non-existing category worked") new_category = self.dbg.CreateCategory("foobar") - new_category.enabled = True + new_category.SetEnabled(True) new_category.AddTypeSummary( lldb.SBTypeNameSpecifier( - "^.*t$", True), summary) + "^.*t$", + True, # is_regexp + ), summary) + self.expect("frame variable foo.A", 
substrs=['hello world']) self.expect("frame variable foo.E", matching=False, @@ -102,7 +105,7 @@ def cleanup(): substrs=['hello world']) self.expect("frame variable foo.F", substrs=['hello world']) - new_category.enabled = False + new_category.SetEnabled(False) self.expect("frame variable foo.A", matching=False, substrs=['hello world']) self.expect("frame variable foo.E", matching=False, @@ -379,7 +382,7 @@ def cleanup(): lldb.SBTypeSummary.CreateWithScriptCode("return 'hello scripted world';")) self.expect("frame variable foo", matching=False, substrs=['hello scripted world']) - new_category.enabled = True + new_category.SetEnabled(True) self.expect("frame variable foo", matching=True, substrs=['hello scripted world']) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/TestLldbGdbServer.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/TestLldbGdbServer.py index 7a39079b472a8..2b7f28a3aefbc 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-server/TestLldbGdbServer.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-server/TestLldbGdbServer.py @@ -675,7 +675,6 @@ def test_Hg_switches_to_3_threads_launch_debugserver(self): self.Hg_switches_to_3_threads() @expectedFailureAll(oslist=["windows"]) # expect 4 threads - @expectedFailureNetBSD @llgs_test def test_Hg_switches_to_3_threads_launch_llgs(self): self.init_llgs_test() @@ -1583,7 +1582,6 @@ def test_P_and_p_thread_suffix_work_debugserver(self): self.P_and_p_thread_suffix_work() @skipIfWindows - @expectedFailureNetBSD @llgs_test def test_P_and_p_thread_suffix_work_llgs(self): self.init_llgs_test() diff --git a/lldb/source/API/SBCompileUnit.cpp b/lldb/source/API/SBCompileUnit.cpp index 581bda3635073..d52040d850a95 100644 --- a/lldb/source/API/SBCompileUnit.cpp +++ b/lldb/source/API/SBCompileUnit.cpp @@ -50,7 +50,7 @@ SBFileSpec SBCompileUnit::GetFileSpec() const { SBFileSpec file_spec; if (m_opaque_ptr) - file_spec.SetFileSpec(*m_opaque_ptr); + file_spec.SetFileSpec(m_opaque_ptr->GetPrimaryFile()); return LLDB_RECORD_RESULT(file_spec); } @@ -106,7 +106,7 @@ uint32_t SBCompileUnit::FindLineEntryIndex(uint32_t start_idx, uint32_t line, if (inline_file_spec && inline_file_spec->IsValid()) file_spec = inline_file_spec->ref(); else - file_spec = *m_opaque_ptr; + file_spec = m_opaque_ptr->GetPrimaryFile(); index = m_opaque_ptr->FindLineEntry( start_idx, line, inline_file_spec ? 
inline_file_spec->get() : nullptr, diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 82dc60489008c..090a3a57a2f4a 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -294,7 +294,7 @@ void SBDebugger::SetInputFileHandle(FILE *fh, bool transfer_ownership) { SBError SBDebugger::SetInputFile(FileSP file_sp) { LLDB_RECORD_METHOD(SBError, SBDebugger, SetInputFile, (FileSP), file_sp); - return SetInputFile(SBFile(file_sp)); + return LLDB_RECORD_RESULT(SetInputFile(SBFile(file_sp))); } // Shouldn't really be settable after initialization as this could cause lots @@ -306,7 +306,7 @@ SBError SBDebugger::SetInputFile(SBFile file) { SBError error; if (!m_opaque_sp) { error.ref().SetErrorString("invalid debugger"); - return error; + return LLDB_RECORD_RESULT(error); } repro::DataRecorder *recorder = nullptr; @@ -330,16 +330,16 @@ SBError SBDebugger::SetInputFile(SBFile file) { if (!file_sp || !file_sp->IsValid()) { error.ref().SetErrorString("invalid file"); - return error; + return LLDB_RECORD_RESULT(error); } m_opaque_sp->SetInputFile(file_sp, recorder); - return error; + return LLDB_RECORD_RESULT(error); } SBError SBDebugger::SetOutputFile(FileSP file_sp) { LLDB_RECORD_METHOD(SBError, SBDebugger, SetOutputFile, (FileSP), file_sp); - return SetOutputFile(SBFile(file_sp)); + return LLDB_RECORD_RESULT(SetOutputFile(SBFile(file_sp))); } void SBDebugger::SetOutputFileHandle(FILE *fh, bool transfer_ownership) { @@ -353,14 +353,14 @@ SBError SBDebugger::SetOutputFile(SBFile file) { SBError error; if (!m_opaque_sp) { error.ref().SetErrorString("invalid debugger"); - return error; + return LLDB_RECORD_RESULT(error); } if (!file) { error.ref().SetErrorString("invalid file"); - return error; + return LLDB_RECORD_RESULT(error); } m_opaque_sp->SetOutputFile(file.m_opaque_sp); - return error; + return LLDB_RECORD_RESULT(error); } void SBDebugger::SetErrorFileHandle(FILE *fh, bool transfer_ownership) { @@ -371,7 +371,7 @@ void SBDebugger::SetErrorFileHandle(FILE *fh, bool transfer_ownership) { SBError SBDebugger::SetErrorFile(FileSP file_sp) { LLDB_RECORD_METHOD(SBError, SBDebugger, SetErrorFile, (FileSP), file_sp); - return SetErrorFile(SBFile(file_sp)); + return LLDB_RECORD_RESULT(SetErrorFile(SBFile(file_sp))); } SBError SBDebugger::SetErrorFile(SBFile file) { @@ -379,14 +379,14 @@ SBError SBDebugger::SetErrorFile(SBFile file) { SBError error; if (!m_opaque_sp) { error.ref().SetErrorString("invalid debugger"); - return error; + return LLDB_RECORD_RESULT(error); } if (!file) { error.ref().SetErrorString("invalid file"); - return error; + return LLDB_RECORD_RESULT(error); } m_opaque_sp->SetErrorFile(file.m_opaque_sp); - return error; + return LLDB_RECORD_RESULT(error); } FILE *SBDebugger::GetInputFileHandle() { @@ -395,7 +395,7 @@ FILE *SBDebugger::GetInputFileHandle() { File &file_sp = m_opaque_sp->GetInputFile(); return LLDB_RECORD_RESULT(file_sp.GetStream()); } - return nullptr; + return LLDB_RECORD_RESULT(nullptr); } SBFile SBDebugger::GetInputFile() { @@ -412,7 +412,7 @@ FILE *SBDebugger::GetOutputFileHandle() { StreamFile &stream_file = m_opaque_sp->GetOutputStream(); return LLDB_RECORD_RESULT(stream_file.GetFile().GetStream()); } - return nullptr; + return LLDB_RECORD_RESULT(nullptr); } SBFile SBDebugger::GetOutputFile() { @@ -431,7 +431,7 @@ FILE *SBDebugger::GetErrorFileHandle() { StreamFile &stream_file = m_opaque_sp->GetErrorStream(); return LLDB_RECORD_RESULT(stream_file.GetFile().GetStream()); } - return nullptr; + return 
LLDB_RECORD_RESULT(nullptr); } SBFile SBDebugger::GetErrorFile() { diff --git a/lldb/source/API/SBFile.cpp b/lldb/source/API/SBFile.cpp index f5a38efe4a779..277402f31abf7 100644 --- a/lldb/source/API/SBFile.cpp +++ b/lldb/source/API/SBFile.cpp @@ -100,24 +100,27 @@ SBError SBFile::Close() { SBFile::operator bool() const { LLDB_RECORD_METHOD_CONST_NO_ARGS(bool, SBFile, operator bool); - return LLDB_RECORD_RESULT(IsValid()); + return IsValid(); } bool SBFile::operator!() const { LLDB_RECORD_METHOD_CONST_NO_ARGS(bool, SBFile, operator!); - return LLDB_RECORD_RESULT(!IsValid()); + return !IsValid(); } FileSP SBFile::GetFile() const { LLDB_RECORD_METHOD_CONST_NO_ARGS(FileSP, SBFile, GetFile); - return m_opaque_sp; + return LLDB_RECORD_RESULT(m_opaque_sp); } namespace lldb_private { namespace repro { template <> void RegisterMethods<SBFile>(Registry &R) { - + LLDB_REGISTER_CONSTRUCTOR(SBFile, ()); + LLDB_REGISTER_CONSTRUCTOR(SBFile, (FileSP)); + LLDB_REGISTER_CONSTRUCTOR(SBFile, (FILE *, bool)); + LLDB_REGISTER_CONSTRUCTOR(SBFile, (int, const char *, bool)); LLDB_REGISTER_METHOD(lldb::SBError, SBFile, Flush, ()); LLDB_REGISTER_METHOD_CONST(bool, SBFile, IsValid, ()); LLDB_REGISTER_METHOD_CONST(bool, SBFile, operator bool,()); diff --git a/lldb/source/API/SBFileSpec.cpp b/lldb/source/API/SBFileSpec.cpp index 2f910b9ba294e..2e7eba42bc909 100644 --- a/lldb/source/API/SBFileSpec.cpp +++ b/lldb/source/API/SBFileSpec.cpp @@ -143,7 +143,7 @@ void SBFileSpec::SetDirectory(const char *directory) { } uint32_t SBFileSpec::GetPath(char *dst_path, size_t dst_len) const { - LLDB_RECORD_METHOD_CONST(uint32_t, SBFileSpec, GetPath, (char *, size_t), + LLDB_RECORD_DUMMY(uint32_t, SBFileSpec, GetPath, (char *, size_t), dst_path, dst_len); uint32_t result = m_opaque_up->GetPath(dst_path, dst_len); diff --git a/lldb/source/API/SBModule.cpp b/lldb/source/API/SBModule.cpp index 7ac189bb42737..4e9dfb0c1e62e 100644 --- a/lldb/source/API/SBModule.cpp +++ b/lldb/source/API/SBModule.cpp @@ -245,7 +245,7 @@ bool SBModule::GetDescription(SBStream &description) { ModuleSP module_sp(GetSP()); if (module_sp) { - module_sp->GetDescription(&strm); + module_sp->GetDescription(strm.AsRawOstream()); } else strm.PutCString("No value"); diff --git a/lldb/source/API/SBReproducer.cpp b/lldb/source/API/SBReproducer.cpp index d50d95ebb5476..1107161a419f1 100644 --- a/lldb/source/API/SBReproducer.cpp +++ b/lldb/source/API/SBReproducer.cpp @@ -22,8 +22,8 @@ #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBHostOS.h" #include "lldb/API/SBReproducer.h" - #include "lldb/Host/FileSystem.h" +#include "lldb/lldb-private.h" using namespace lldb; using namespace lldb_private; @@ -124,7 +124,7 @@ const char *SBReproducer::Capture(const char *path) { return nullptr; } -const char *SBReproducer::Replay(const char *path) { +const char *SBReproducer::Replay(const char *path, bool skip_version_check) { static std::string error; if (auto e = Reproducer::Initialize(ReproducerMode::Replay, FileSpec(path))) { error = llvm::toString(std::move(e)); @@ -137,6 +137,22 @@ return error.c_str(); } + if (!skip_version_check) { + llvm::Expected<std::string> version = loader->LoadBuffer<VersionProvider>(); + if (!version) { + error = llvm::toString(version.takeError()); + return error.c_str(); + } + if (lldb_private::GetVersion() != llvm::StringRef(*version).rtrim()) { + error = "reproducer capture and replay version don't match:\n"; + error.append("reproducer captured with:\n"); + error.append(*version); + error.append("reproducer replayed with:\n"); + error.append(lldb_private::GetVersion()); + return error.c_str(); + } + } + FileSpec file = loader->GetFile(); if (!file) { error = "unable to get replay data from reproducer.";
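// Annotation, not part of the patch: the Replay hunk above now refuses to
// replay a reproducer against a different lldb than the one that captured it.
// A minimal sketch of that gate as a hypothetical standalone helper (uses
// llvm/Support/Error.h; the real check lives inline in SBReproducer::Replay):
static llvm::Error CheckReplayVersion(llvm::StringRef captured,
                                      llvm::StringRef current) {
  // The captured version buffer carries a trailing newline, hence the rtrim().
  if (current != captured.rtrim())
    return llvm::createStringError(
        llvm::inconvertibleErrorCode(),
        "reproducer capture and replay version don't match");
  return llvm::Error::success();
}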
diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index 7013e2b45e5ff..312e4df758631 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -1176,12 +1176,15 @@ bool SBTarget::FindBreakpointsByName(const char *name, TargetSP target_sp(GetSP()); if (target_sp) { std::lock_guard<std::recursive_mutex> guard(target_sp->GetAPIMutex()); - BreakpointList bkpt_list(false); - bool is_valid = - target_sp->GetBreakpointList().FindBreakpointsByName(name, bkpt_list); - if (!is_valid) + llvm::Expected<std::vector<BreakpointSP>> expected_vector = + target_sp->GetBreakpointList().FindBreakpointsByName(name); + if (!expected_vector) { + LLDB_LOG(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_BREAKPOINTS), + "invalid breakpoint name: {}", + llvm::toString(expected_vector.takeError())); return false; - for (BreakpointSP bkpt_sp : bkpt_list.Breakpoints()) { + } + for (BreakpointSP bkpt_sp : *expected_vector) { bkpts.AppendByID(bkpt_sp->GetID()); } } diff --git a/lldb/source/API/SBThread.cpp b/lldb/source/API/SBThread.cpp index 8d4930bf6edb0..f7f748f568321 100644 --- a/lldb/source/API/SBThread.cpp +++ b/lldb/source/API/SBThread.cpp @@ -914,9 +914,10 @@ SBError SBThread::StepOverUntil(lldb::SBFrame &sb_frame, const bool exact = false; SymbolContextList sc_list; - const uint32_t num_matches = frame_sc.comp_unit->ResolveSymbolContext( - step_file_spec, line, check_inlines, exact, eSymbolContextLineEntry, - sc_list); + frame_sc.comp_unit->ResolveSymbolContext(step_file_spec, line, + check_inlines, exact, + eSymbolContextLineEntry, sc_list); + const uint32_t num_matches = sc_list.GetSize(); if (num_matches > 0) { SymbolContext sc; for (uint32_t i = 0; i < num_matches; ++i) { @@ -1036,7 +1037,7 @@ SBError SBThread::JumpToLine(lldb::SBFileSpec &file_spec, uint32_t line) { Thread *thread = exe_ctx.GetThreadPtr(); - Status err = thread->JumpToLine(file_spec.get(), line, true); + Status err = thread->JumpToLine(file_spec.ref(), line, true); sb_error.SetError(err); return LLDB_RECORD_RESULT(sb_error); }
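// Annotation, not part of the patch: FindBreakpointsByName (changed in the
// BreakpointList.cpp hunk below) now returns
// llvm::Expected<std::vector<BreakpointSP>> instead of filling an out-list and
// returning bool. A hypothetical caller, mirroring the SBTarget hunk above,
// must check the Expected and consume its error exactly once:
static size_t CountBreakpointsNamed(BreakpointList &list, const char *name) {
  llvm::Expected<std::vector<BreakpointSP>> matches =
      list.FindBreakpointsByName(name);
  if (!matches) {
    // An llvm::Error destroyed unconsumed aborts; log or consume it.
    llvm::consumeError(matches.takeError());
    return 0;
  }
  return matches->size();
}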
diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp index a112542803c47..13acf4bb92e20 100644 --- a/lldb/source/Breakpoint/Breakpoint.cpp +++ b/lldb/source/Breakpoint/Breakpoint.cpp @@ -638,7 +638,8 @@ static bool SymbolContextsMightBeEquivalent(SymbolContext &old_sc, } else { // Otherwise we will compare by name... if (old_sc.comp_unit && new_sc.comp_unit) { - if (FileSpec::Equal(*old_sc.comp_unit, *new_sc.comp_unit, true)) { + if (old_sc.comp_unit->GetPrimaryFile() == + new_sc.comp_unit->GetPrimaryFile()) { // Now check the functions: if (old_sc.function && new_sc.function && (old_sc.function->GetName() == new_sc.function->GetName())) { diff --git a/lldb/source/Breakpoint/BreakpointList.cpp b/lldb/source/Breakpoint/BreakpointList.cpp index c80fb917b4903..5b23c633d14c6 100644 --- a/lldb/source/Breakpoint/BreakpointList.cpp +++ b/lldb/source/Breakpoint/BreakpointList.cpp @@ -10,6 +10,8 @@ #include "lldb/Target/Target.h" +#include "llvm/Support/Errc.h" + using namespace lldb; using namespace lldb_private; @@ -128,22 +130,24 @@ BreakpointSP BreakpointList::FindBreakpointByID(break_id_t break_id) const { return {}; } -bool BreakpointList::FindBreakpointsByName(const char *name, - BreakpointList &matching_bps) { - Status error; +llvm::Expected<std::vector<BreakpointSP>> +BreakpointList::FindBreakpointsByName(const char *name) { if (!name) - return false; + return llvm::createStringError(llvm::errc::invalid_argument, + "FindBreakpointsByName requires a name"); + Status error; if (!BreakpointID::StringIsBreakpointName(llvm::StringRef(name), error)) - return false; + return error.ToError(); + std::vector<BreakpointSP> matching_bps; for (BreakpointSP bkpt_sp : Breakpoints()) { if (bkpt_sp->MatchesName(name)) { - matching_bps.Add(bkpt_sp, false); + matching_bps.push_back(bkpt_sp); } } - return true; + return matching_bps; } void BreakpointList::Dump(Stream *s) const { diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 46b8f25c56682..e6d7d85f90605 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -525,7 +525,7 @@ void BreakpointLocation::GetDescription(Stream *s, if (sc.comp_unit != nullptr) { s->EOL(); s->Indent("compile unit = "); - static_cast<FileSpec *>(sc.comp_unit)->GetFilename().Dump(s); + sc.comp_unit->GetPrimaryFile().GetFilename().Dump(s); if (sc.function != nullptr) { s->EOL(); diff --git a/lldb/source/Breakpoint/BreakpointOptions.cpp b/lldb/source/Breakpoint/BreakpointOptions.cpp index 0d4c6173c3c54..8fd16f420c04f 100644 --- a/lldb/source/Breakpoint/BreakpointOptions.cpp +++ b/lldb/source/Breakpoint/BreakpointOptions.cpp @@ -566,7 +566,8 @@ void BreakpointOptions::GetDescription(Stream *s, if (m_callback_baton_sp.get()) { if (level != eDescriptionLevelBrief) { s->EOL(); - m_callback_baton_sp->GetDescription(s, level); + m_callback_baton_sp->GetDescription(s->AsRawOstream(), level, + s->GetIndentLevel()); } } if (!m_condition_text.empty()) { @@ -578,35 +579,33 @@ } void BreakpointOptions::CommandBaton::GetDescription( - Stream *s, lldb::DescriptionLevel level) const { + llvm::raw_ostream &s, lldb::DescriptionLevel level, + unsigned indentation) const { const CommandData *data = getItem(); if (level == eDescriptionLevelBrief) { - s->Printf(", commands = %s", - (data && data->user_source.GetSize() > 0) ? "yes" : "no"); + s << ", commands = " + << ((data && data->user_source.GetSize() > 0) ? "yes" : "no"); return; } - s->IndentMore(); - s->Indent("Breakpoint commands"); + indentation += 2; + s.indent(indentation); + s << "Breakpoint commands"; if (data->interpreter != eScriptLanguageNone) - s->Printf(" (%s):\n", - ScriptInterpreter::LanguageToString(data->interpreter).c_str()); + s << llvm::formatv(" ({0}):\n", + ScriptInterpreter::LanguageToString(data->interpreter)); else - s->PutCString(":\n"); + s << ":\n"; - s->IndentMore(); + indentation += 2; if (data && data->user_source.GetSize() > 0) { - const size_t num_strings = data->user_source.GetSize(); - for (size_t i = 0; i < num_strings; ++i) { - s->Indent(data->user_source.GetStringAtIndex(i)); - s->EOL(); + for (llvm::StringRef str : data->user_source) { + s.indent(indentation); + s << str << "\n"; } - } else { - s->PutCString("No commands.\n"); - } - s->IndentLess(); - s->IndentLess(); + } else + s << "No commands.\n"; } void BreakpointOptions::SetCommandDataCallback( diff --git a/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp b/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp index 3cb04263c6dcb..6b600a7cf128f 100644 --- a/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp +++ b/lldb/source/Breakpoint/BreakpointResolverFileRegex.cpp @@ -102,7 +102,7 @@ Searcher::CallbackReturn BreakpointResolverFileRegex::SearchCallback( return eCallbackReturnContinue; CompileUnit *cu = context.comp_unit; - FileSpec cu_file_spec = *(static_cast<FileSpec *>(cu)); + FileSpec cu_file_spec = cu->GetPrimaryFile(); std::vector<uint32_t> line_matches; context.target_sp->GetSourceManager().FindLinesMatchingRegex( cu_file_spec, m_regex, 1, UINT32_MAX, line_matches); diff --git a/lldb/source/Breakpoint/WatchpointOptions.cpp b/lldb/source/Breakpoint/WatchpointOptions.cpp index cd5ef930e5dcf..026bf2f746aef 100644 --- a/lldb/source/Breakpoint/WatchpointOptions.cpp +++ b/lldb/source/Breakpoint/WatchpointOptions.cpp @@ -121,7 +121,8 @@ void WatchpointOptions::GetCallbackDescription( Stream *s, lldb::DescriptionLevel level) const { if (m_callback_baton_sp.get()) { s->EOL(); - m_callback_baton_sp->GetDescription(s, level); + m_callback_baton_sp->GetDescription(s->AsRawOstream(), level, + s->GetIndentLevel()); } } @@ -156,27 +157,26 @@ void WatchpointOptions::GetDescription(Stream *s, } void WatchpointOptions::CommandBaton::GetDescription( - Stream *s, lldb::DescriptionLevel level) const { + llvm::raw_ostream &s, lldb::DescriptionLevel level, + unsigned indentation) const { const CommandData *data = getItem(); if (level == eDescriptionLevelBrief) { - s->Printf(", commands = %s", - (data && data->user_source.GetSize() > 0) ? "yes" : "no"); + s << ", commands = " + << ((data && data->user_source.GetSize() > 0) ? "yes" : "no"); return; } - s->IndentMore(); - s->Indent("watchpoint commands:\n"); + indentation += 2; + s.indent(indentation); + s << "watchpoint commands:\n"; - s->IndentMore(); + indentation += 2; if (data && data->user_source.GetSize() > 0) { for (const std::string &line : data->user_source) { - s->Indent(line); - s->EOL(); + s.indent(indentation); + s << line << "\n"; } - } else { - s->PutCString("No commands.\n"); - } - s->IndentLess(); - s->IndentLess(); + } else + s << "No commands.\n"; }
diff --git a/lldb/source/Commands/CommandCompletions.cpp b/lldb/source/Commands/CommandCompletions.cpp index 469a6bbbadf65..b382e26e2b704 100644 --- a/lldb/source/Commands/CommandCompletions.cpp +++ b/lldb/source/Commands/CommandCompletions.cpp @@ -378,8 +378,10 @@ CommandCompletions::SourceFileCompleter::SearchCallback(SearchFilter &filter, } } } else { - const char *cur_file_name = context.comp_unit->GetFilename().GetCString(); - const char *cur_dir_name = context.comp_unit->GetDirectory().GetCString(); + const char *cur_file_name = + context.comp_unit->GetPrimaryFile().GetFilename().GetCString(); + const char *cur_dir_name = + context.comp_unit->GetPrimaryFile().GetDirectory().GetCString(); bool match = false; if (m_file_name && cur_file_name && @@ -391,7 +393,7 @@ CommandCompletions::SourceFileCompleter::SearchCallback(SearchFilter &filter, match = false; if (match) { - m_matching_files.AppendIfUnique(context.comp_unit); + m_matching_files.AppendIfUnique(context.comp_unit->GetPrimaryFile()); } } } @@ -411,10 +413,7 @@ void CommandCompletions::SourceFileCompleter::DoCompletion( // SymbolCompleter static bool regex_chars(const char comp) { - return (comp == '[' || comp == ']' || comp == '(' || comp == ')' || - comp == '{' || comp == '}' || comp == '+' || comp == '.' || - comp == '*' || comp == '|' || comp == '^' || comp == '$' || - comp == '\\' || comp == '?'); + return llvm::StringRef("[](){}+.*|^$\\?").contains(comp); } CommandCompletions::SymbolCompleter::SymbolCompleter( diff --git a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp index 1a4432149f731..a82e70a1cdaba 100644 --- a/lldb/source/Commands/CommandObjectBreakpointCommand.cpp +++ b/lldb/source/Commands/CommandObjectBreakpointCommand.cpp @@ -674,10 +674,10 @@ class CommandObjectBreakpointCommandList : public CommandObjectParsed { if (baton) { result.GetOutputStream().Printf("Breakpoint %s:\n", id_str.GetData()); - result.GetOutputStream().IndentMore(); - baton->GetDescription(&result.GetOutputStream(), - eDescriptionLevelFull); - result.GetOutputStream().IndentLess(); + baton->GetDescription(result.GetOutputStream().AsRawOstream(), + eDescriptionLevelFull, + result.GetOutputStream().GetIndentLevel() + + 2); } else { result.AppendMessageWithFormat( "Breakpoint %s does not have an associated command.\n", diff --git a/lldb/source/Commands/CommandObjectGUI.cpp b/lldb/source/Commands/CommandObjectGUI.cpp index fac2e96277839..898468a977f3f 100644 --- a/lldb/source/Commands/CommandObjectGUI.cpp +++ b/lldb/source/Commands/CommandObjectGUI.cpp @@ -8,6 +8,7 @@ #include "CommandObjectGUI.h" +#include "lldb/Core/IOHandlerCursesGUI.h" #include "lldb/Interpreter/CommandInterpreter.h" #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/lldb-private.h" diff --git a/lldb/source/Commands/CommandObjectSource.cpp b/lldb/source/Commands/CommandObjectSource.cpp index fd1b158afb16d..807c04f4c65e5 100644 --- a/lldb/source/Commands/CommandObjectSource.cpp +++ b/lldb/source/Commands/CommandObjectSource.cpp @@ -146,12 +146,6 @@ class CommandObjectSourceInfo : public CommandObjectParsed { Target *target = m_exe_ctx.GetTargetPtr(); uint32_t num_matches = 0; - bool has_path = false; - if (file_spec) { - assert(file_spec.GetFilename().AsCString()); - has_path = (file_spec.GetDirectory().AsCString() != nullptr); - } - // Dump all the line entries for the file in the list.
ConstString last_module_file_name; uint32_t num_scs = sc_list.GetSize(); @@ -168,8 +162,7 @@ class CommandObjectSourceInfo : public CommandObjectParsed { if (module_list.GetSize() && module_list.GetIndexForModule(module) == LLDB_INVALID_INDEX32) continue; - if (file_spec && !lldb_private::FileSpec::Equal( - file_spec, line_entry.file, has_path)) + if (!FileSpec::Match(file_spec, line_entry.file)) continue; if (start_line > 0 && line_entry.line < start_line) continue; @@ -250,13 +243,13 @@ class CommandObjectSourceInfo : public CommandObjectParsed { num_matches++; if (num_lines > 0 && num_matches > num_lines) break; - assert(lldb_private::FileSpec::Equal(cu_file_spec, line_entry.file, - has_path)); + assert(cu_file_spec == line_entry.file); if (!cu_header_printed) { if (num_matches > 0) strm << "\n\n"; strm << "Lines found for file " << file_spec_name - << " in compilation unit " << cu->GetFilename() << " in `" + << " in compilation unit " + << cu->GetPrimaryFile().GetFilename() << " in `" << module_file_name << "\n"; cu_header_printed = true; } @@ -1077,7 +1070,8 @@ class CommandObjectSourceList : public CommandObjectParsed { if (m_options.show_bp_locs) { m_breakpoint_locations.Clear(); const bool show_inlines = true; - m_breakpoint_locations.Reset(*sc.comp_unit, 0, show_inlines); + m_breakpoint_locations.Reset(sc.comp_unit->GetPrimaryFile(), 0, + show_inlines); SearchFilterForUnconstrainedSearches target_search_filter( target->shared_from_this()); target_search_filter.Search(m_breakpoint_locations); @@ -1106,8 +1100,8 @@ class CommandObjectSourceList : public CommandObjectParsed { ? sc.line_entry.column : 0; target->GetSourceManager().DisplaySourceLinesWithLineNumbers( - sc.comp_unit, sc.line_entry.line, column, lines_to_back_up, - m_options.num_lines - lines_to_back_up, "->", + sc.comp_unit->GetPrimaryFile(), sc.line_entry.line, column, + lines_to_back_up, m_options.num_lines - lines_to_back_up, "->", &result.GetOutputStream(), GetBreakpointLocations()); result.SetStatus(eReturnStatusSuccessFinishResult); } @@ -1190,18 +1184,18 @@ class CommandObjectSourceList : public CommandObjectParsed { if (num_matches > 1) { bool got_multiple = false; - FileSpec *test_cu_spec = nullptr; + CompileUnit *test_cu = nullptr; for (unsigned i = 0; i < num_matches; i++) { SymbolContext sc; sc_list.GetContextAtIndex(i, sc); if (sc.comp_unit) { - if (test_cu_spec) { - if (test_cu_spec != static_cast<FileSpec *>(sc.comp_unit)) + if (test_cu) { + if (test_cu != sc.comp_unit) got_multiple = true; break; } else - test_cu_spec = sc.comp_unit; + test_cu = sc.comp_unit; } } if (got_multiple) { @@ -1218,7 +1212,8 @@ class CommandObjectSourceList : public CommandObjectParsed { if (sc.comp_unit) { if (m_options.show_bp_locs) { const bool show_inlines = true; - m_breakpoint_locations.Reset(*sc.comp_unit, 0, show_inlines); + m_breakpoint_locations.Reset(sc.comp_unit->GetPrimaryFile(), 0, + show_inlines); SearchFilterForUnconstrainedSearches target_search_filter( target->shared_from_this()); target_search_filter.Search(m_breakpoint_locations); @@ -1229,7 +1224,7 @@ class CommandObjectSourceList : public CommandObjectParsed { m_options.num_lines = 10; const uint32_t column = 0; target->GetSourceManager().DisplaySourceLinesWithLineNumbers( - sc.comp_unit, m_options.start_line, column, 0, + sc.comp_unit->GetPrimaryFile(), m_options.start_line, column, 0, m_options.num_lines, "", &result.GetOutputStream(), GetBreakpointLocations());
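// Annotation, not part of the patch: the pattern behind the
// CommandObjectSource hunks above (and the Breakpoint*/CommandCompletions
// hunks earlier). CompileUnit used to inherit from FileSpec, so code could
// dereference or static_cast a CompileUnit* as a file; with that inheritance
// gone, the file is an explicit member. Hypothetical before/after:
static FileSpec CompileUnitFile(CompileUnit *comp_unit) {
  // old (is-a):  return *static_cast<FileSpec *>(comp_unit);
  return comp_unit->GetPrimaryFile(); // new (has-a)
}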
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index d77207bb82cfc..ac3188740234e 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -78,7 +78,7 @@ static void DumpTargetInfo(uint32_t target_idx, Target *target, uint32_t properties = 0; if (target_arch.IsValid()) { strm.Printf("%sarch=", properties++ > 0 ? ", " : " ( "); - target_arch.DumpTriple(strm); + target_arch.DumpTriple(strm.AsRawOstream()); properties++; } PlatformSP platform_sp(target->GetPlatform()); @@ -816,15 +816,14 @@ class CommandObjectTargetVariable : public CommandObjectParsed { return; if (sc.module_sp) { if (sc.comp_unit) { - s.Printf("Global variables for %s in %s:\n", - sc.comp_unit->GetPath().c_str(), - sc.module_sp->GetFileSpec().GetPath().c_str()); + s.Format("Global variables for {0} in {1}:\n", + sc.comp_unit->GetPrimaryFile(), sc.module_sp->GetFileSpec()); } else { s.Printf("Global variables for %s\n", sc.module_sp->GetFileSpec().GetPath().c_str()); } } else if (sc.comp_unit) { - s.Printf("Global variables for %s\n", sc.comp_unit->GetPath().c_str()); + s.Format("Global variables for {0}\n", sc.comp_unit->GetPrimaryFile()); } for (VariableSP var_sp : variable_list) { @@ -926,9 +925,9 @@ class CommandObjectTargetVariable : public CommandObjectParsed { if (!success) { if (frame) { if (comp_unit) - result.AppendErrorWithFormat( - "no global variables in current compile unit: %s\n", - comp_unit->GetPath().c_str()); + result.AppendErrorWithFormatv( - "no global variables in current compile unit: {0}\n", - comp_unit->GetPrimaryFile()); else result.AppendErrorWithFormat( "no debug information for frame %u\n", @@ -1292,7 +1291,7 @@ static void DumpModuleArchitecture(Stream &strm, Module *module, StreamString arch_strm; if (full_triple) - module->GetArchitecture().DumpTriple(arch_strm); + module->GetArchitecture().DumpTriple(arch_strm.AsRawOstream()); else arch_strm.PutCString(module->GetArchitecture().GetArchitectureName()); std::string arch_str = arch_strm.GetString(); @@ -1327,8 +1326,8 @@ static uint32_t DumpCompileUnitLineTable(CommandInterpreter &interpreter, if (i > 0) strm << "\n\n"; - strm << "Line table for " << *static_cast<FileSpec *>(sc.comp_unit) - << " in `" << module->GetFileSpec().GetFilename() << "\n"; + strm << "Line table for " << sc.comp_unit->GetPrimaryFile() << " in `" + << module->GetFileSpec().GetFilename() << "\n"; LineTable *line_table = sc.comp_unit->GetLineTable(); if (line_table) line_table->GetDescription( diff --git a/lldb/source/Commands/CommandObjectThread.cpp b/lldb/source/Commands/CommandObjectThread.cpp index c93bd9d5c2323..13c17dfe3cca2 100644 --- a/lldb/source/Commands/CommandObjectThread.cpp +++ b/lldb/source/Commands/CommandObjectThread.cpp @@ -526,7 +526,7 @@ class CommandObjectThreadStepWithTypeAndScope : public CommandObjectParsed { eCommandProcessMustBeLaunched | eCommandProcessMustBePaused), m_step_type(step_type), m_step_scope(step_scope), m_options(), - m_class_options("scripted step", 'C') { + m_class_options("scripted step") { CommandArgumentEntry arg; CommandArgumentData thread_id_arg; @@ -1193,7 +1193,7 @@ class CommandObjectThreadUntil : public CommandObjectParsed { LineEntry line_entry; const bool exact = false; start_idx_ptr = sc.comp_unit->FindLineEntry( - start_idx_ptr, line_number, sc.comp_unit, exact, &line_entry); + start_idx_ptr, line_number, nullptr, exact, &line_entry); if (start_idx_ptr == UINT32_MAX) break; diff --git a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp index 
5683381efc858..92a91cfac2208 100644 --- a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp +++ b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp @@ -611,10 +611,10 @@ class CommandObjectWatchpointCommandList : public CommandObjectParsed { const Baton *baton = wp_options->GetBaton(); if (baton) { result.GetOutputStream().Printf("Watchpoint %u:\n", cur_wp_id); - result.GetOutputStream().IndentMore(); - baton->GetDescription(&result.GetOutputStream(), - eDescriptionLevelFull); - result.GetOutputStream().IndentLess(); + baton->GetDescription(result.GetOutputStream().AsRawOstream(), + eDescriptionLevelFull, + result.GetOutputStream().GetIndentLevel() + + 2); } else { result.AppendMessageWithFormat( "Watchpoint %u does not have an associated command.\n", diff --git a/lldb/source/Core/AddressResolverFileLine.cpp b/lldb/source/Core/AddressResolverFileLine.cpp index 4a14260c6c72f..4122b5d3b747d 100644 --- a/lldb/source/Core/AddressResolverFileLine.cpp +++ b/lldb/source/Core/AddressResolverFileLine.cpp @@ -40,14 +40,13 @@ Searcher::CallbackReturn AddressResolverFileLine::SearchCallback(SearchFilter &filter, SymbolContext &context, Address *addr) { SymbolContextList sc_list; - uint32_t sc_list_size; CompileUnit *cu = context.comp_unit; Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_BREAKPOINTS)); - sc_list_size = - cu->ResolveSymbolContext(m_file_spec, m_line_number, m_inlines, false, - eSymbolContextEverything, sc_list); + cu->ResolveSymbolContext(m_file_spec, m_line_number, m_inlines, false, + eSymbolContextEverything, sc_list); + uint32_t sc_list_size = sc_list.GetSize(); for (uint32_t i = 0; i < sc_list_size; i++) { SymbolContext sc; if (sc_list.GetContextAtIndex(i, sc)) { diff --git a/lldb/source/Core/CMakeLists.txt b/lldb/source/Core/CMakeLists.txt index f3ce87ae4f231..a6f7ba8dc25ba 100644 --- a/lldb/source/Core/CMakeLists.txt +++ b/lldb/source/Core/CMakeLists.txt @@ -37,6 +37,7 @@ add_lldb_library(lldbCore FormatEntity.cpp Highlighter.cpp IOHandler.cpp + IOHandlerCursesGUI.cpp Mangled.cpp Module.cpp ModuleChild.cpp diff --git a/lldb/source/Core/FileLineResolver.cpp b/lldb/source/Core/FileLineResolver.cpp index 01df295398a83..7d91d1a3e472c 100644 --- a/lldb/source/Core/FileLineResolver.cpp +++ b/lldb/source/Core/FileLineResolver.cpp @@ -36,8 +36,8 @@ FileLineResolver::SearchCallback(SearchFilter &filter, SymbolContext &context, Address *addr) { CompileUnit *cu = context.comp_unit; - if (m_inlines || - m_file_spec.Compare(*cu, m_file_spec, (bool)m_file_spec.GetDirectory())) { + if (m_inlines || m_file_spec.Compare(cu->GetPrimaryFile(), m_file_spec, + (bool)m_file_spec.GetDirectory())) { uint32_t start_file_idx = 0; uint32_t file_idx = cu->GetSupportFiles().FindFileIndex(start_file_idx, m_file_spec, false); diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp index c90828f40989c..07ca0a68a10b4 100644 --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -1376,8 +1376,7 @@ bool FormatEntity::Format(const Entry &entry, Stream &s, if (sc) { CompileUnit *cu = sc->comp_unit; if (cu) { - // CompileUnit is a FileSpec - if (DumpFile(s, *cu, (FileKind)entry.number)) + if (DumpFile(s, cu->GetPrimaryFile(), (FileKind)entry.number)) return true; } } diff --git a/lldb/source/Core/IOHandler.cpp b/lldb/source/Core/IOHandler.cpp index d11248094e056..38e65e63d5870 100644 --- a/lldb/source/Core/IOHandler.cpp +++ b/lldb/source/Core/IOHandler.cpp @@ -8,11 +8,6 @@ #include "lldb/Core/IOHandler.h" -#ifndef LLDB_DISABLE_CURSES 
-#include <curses.h> -#include <panel.h> -#endif - -#if defined(__APPLE__) #include <deque> #endif @@ -32,24 +27,6 @@ #endif #include "lldb/Interpreter/CommandCompletions.h" #include "lldb/Interpreter/CommandInterpreter.h" -#ifndef LLDB_DISABLE_CURSES -#include "lldb/Breakpoint/BreakpointLocation.h" -#include "lldb/Core/Module.h" -#include "lldb/Core/ValueObject.h" -#include "lldb/Core/ValueObjectRegister.h" -#include "lldb/Symbol/Block.h" -#include "lldb/Symbol/Function.h" -#include "lldb/Symbol/Symbol.h" -#include "lldb/Symbol/VariableList.h" -#include "lldb/Target/Process.h" -#include "lldb/Target/RegisterContext.h" -#include "lldb/Target/StackFrame.h" -#include "lldb/Target/StopInfo.h" -#include "lldb/Target/Target.h" -#include "lldb/Target/Thread.h" -#include "lldb/Utility/State.h" -#endif - -#include "llvm/ADT/StringRef.h" #ifdef _WIN32 @@ -631,3994 +608,3 @@ void IOHandlerEditline::PrintAsync(Stream *stream, const char *s, size_t len) { #endif } } - -// we may want curses to be disabled for some builds for instance, windows -#ifndef LLDB_DISABLE_CURSES - -#define KEY_RETURN 10 -#define KEY_ESCAPE 27 - -namespace curses { -class Menu; -class MenuDelegate; -class Window; -class WindowDelegate; -typedef std::shared_ptr<Menu> MenuSP; -typedef std::shared_ptr<MenuDelegate> MenuDelegateSP; -typedef std::shared_ptr<Window> WindowSP; -typedef std::shared_ptr<WindowDelegate> WindowDelegateSP; -typedef std::vector<MenuSP> Menus; -typedef std::vector<WindowSP> Windows; -typedef std::vector<WindowDelegateSP> WindowDelegates; - -#if 0 -type summary add -s "x=${var.x}, y=${var.y}" curses::Point -type summary add -s "w=${var.width}, h=${var.height}" curses::Size -type summary add -s "${var.origin%S} ${var.size%S}" curses::Rect -#endif - -struct Point { - int x; - int y; - - Point(int _x = 0, int _y = 0) : x(_x), y(_y) {} - - void Clear() { - x = 0; - y = 0; - } - - Point &operator+=(const Point &rhs) { - x += rhs.x; - y += rhs.y; - return *this; - } - - void Dump() { printf("(x=%i, y=%i)\n", x, y); } -}; - -bool operator==(const Point &lhs, const Point &rhs) { - return lhs.x == rhs.x && lhs.y == rhs.y; -} - -bool operator!=(const Point &lhs, const Point &rhs) { - return lhs.x != rhs.x || lhs.y != rhs.y; -} - -struct Size { - int width; - int height; - Size(int w = 0, int h = 0) : width(w), height(h) {} - - void Clear() { - width = 0; - height = 0; - } - - void Dump() { printf("(w=%i, h=%i)\n", width, height); } -}; - -bool operator==(const Size &lhs, const Size &rhs) { - return lhs.width == rhs.width && lhs.height == rhs.height; -} - -bool operator!=(const Size &lhs, const Size &rhs) { - return lhs.width != rhs.width || lhs.height != rhs.height; -} - -struct Rect { - Point origin; - Size size; - - Rect() : origin(), size() {} - - Rect(const Point &p, const Size &s) : origin(p), size(s) {} - - void Clear() { - origin.Clear(); - size.Clear(); - } - - void Dump() { - printf("(x=%i, y=%i), w=%i, h=%i)\n", origin.x, origin.y, size.width, - size.height); - } - - void Inset(int w, int h) { - if (size.width > w * 2) - size.width -= w * 2; - origin.x += w; - - if (size.height > h * 2) - size.height -= h * 2; - origin.y += h; - } - - // Return a status bar rectangle which is the last line of this rectangle. - // This rectangle will be modified to not include the status bar area. - Rect MakeStatusBar() { - Rect status_bar; - if (size.height > 1) { - status_bar.origin.x = origin.x; - status_bar.origin.y = size.height; - status_bar.size.width = size.width; - status_bar.size.height = 1; - --size.height; - } - return status_bar; - } - - // Return a menubar rectangle which is the first line of this rectangle. 
This - // rectangle will be modified to not include the menubar area. - Rect MakeMenuBar() { - Rect menubar; - if (size.height > 1) { - menubar.origin.x = origin.x; - menubar.origin.y = origin.y; - menubar.size.width = size.width; - menubar.size.height = 1; - ++origin.y; - --size.height; - } - return menubar; - } - - void HorizontalSplitPercentage(float top_percentage, Rect &top, - Rect &bottom) const { - float top_height = top_percentage * size.height; - HorizontalSplit(top_height, top, bottom); - } - - void HorizontalSplit(int top_height, Rect &top, Rect &bottom) const { - top = *this; - if (top_height < size.height) { - top.size.height = top_height; - bottom.origin.x = origin.x; - bottom.origin.y = origin.y + top.size.height; - bottom.size.width = size.width; - bottom.size.height = size.height - top.size.height; - } else { - bottom.Clear(); - } - } - - void VerticalSplitPercentage(float left_percentage, Rect &left, - Rect &right) const { - float left_width = left_percentage * size.width; - VerticalSplit(left_width, left, right); - } - - void VerticalSplit(int left_width, Rect &left, Rect &right) const { - left = *this; - if (left_width < size.width) { - left.size.width = left_width; - right.origin.x = origin.x + left.size.width; - right.origin.y = origin.y; - right.size.width = size.width - left.size.width; - right.size.height = size.height; - } else { - right.Clear(); - } - } -}; - -bool operator==(const Rect &lhs, const Rect &rhs) { - return lhs.origin == rhs.origin && lhs.size == rhs.size; -} - -bool operator!=(const Rect &lhs, const Rect &rhs) { - return lhs.origin != rhs.origin || lhs.size != rhs.size; -} - -enum HandleCharResult { - eKeyNotHandled = 0, - eKeyHandled = 1, - eQuitApplication = 2 -}; - -enum class MenuActionResult { - Handled, - NotHandled, - Quit // Exit all menus and quit -}; - -struct KeyHelp { - int ch; - const char *description; -}; - -class WindowDelegate { -public: - virtual ~WindowDelegate() = default; - - virtual bool WindowDelegateDraw(Window &window, bool force) { - return false; // Drawing not handled - } - - virtual HandleCharResult WindowDelegateHandleChar(Window &window, int key) { - return eKeyNotHandled; - } - - virtual const char *WindowDelegateGetHelpText() { return nullptr; } - - virtual KeyHelp *WindowDelegateGetKeyHelp() { return nullptr; } -}; - -class HelpDialogDelegate : public WindowDelegate { -public: - HelpDialogDelegate(const char *text, KeyHelp *key_help_array); - - ~HelpDialogDelegate() override; - - bool WindowDelegateDraw(Window &window, bool force) override; - - HandleCharResult WindowDelegateHandleChar(Window &window, int key) override; - - size_t GetNumLines() const { return m_text.GetSize(); } - - size_t GetMaxLineLength() const { return m_text.GetMaxStringLength(); } - -protected: - StringList m_text; - int m_first_visible_line; -}; - -class Window { -public: - Window(const char *name) - : m_name(name), m_window(nullptr), m_panel(nullptr), m_parent(nullptr), - m_subwindows(), m_delegate_sp(), m_curr_active_window_idx(UINT32_MAX), - m_prev_active_window_idx(UINT32_MAX), m_delete(false), - m_needs_update(true), m_can_activate(true), m_is_subwin(false) {} - - Window(const char *name, WINDOW *w, bool del = true) - : m_name(name), m_window(nullptr), m_panel(nullptr), m_parent(nullptr), - m_subwindows(), m_delegate_sp(), m_curr_active_window_idx(UINT32_MAX), - m_prev_active_window_idx(UINT32_MAX), m_delete(del), - m_needs_update(true), m_can_activate(true), m_is_subwin(false) { - if (w) - Reset(w); - } - - Window(const char *name, 
const Rect &bounds) - : m_name(name), m_window(nullptr), m_parent(nullptr), m_subwindows(), - m_delegate_sp(), m_curr_active_window_idx(UINT32_MAX), - m_prev_active_window_idx(UINT32_MAX), m_delete(true), - m_needs_update(true), m_can_activate(true), m_is_subwin(false) { - Reset(::newwin(bounds.size.height, bounds.size.width, bounds.origin.y, - bounds.origin.y)); - } - - virtual ~Window() { - RemoveSubWindows(); - Reset(); - } - - void Reset(WINDOW *w = nullptr, bool del = true) { - if (m_window == w) - return; - - if (m_panel) { - ::del_panel(m_panel); - m_panel = nullptr; - } - if (m_window && m_delete) { - ::delwin(m_window); - m_window = nullptr; - m_delete = false; - } - if (w) { - m_window = w; - m_panel = ::new_panel(m_window); - m_delete = del; - } - } - - void AttributeOn(attr_t attr) { ::wattron(m_window, attr); } - void AttributeOff(attr_t attr) { ::wattroff(m_window, attr); } - void Box(chtype v_char = ACS_VLINE, chtype h_char = ACS_HLINE) { - ::box(m_window, v_char, h_char); - } - void Clear() { ::wclear(m_window); } - void Erase() { ::werase(m_window); } - Rect GetBounds() { - return Rect(GetParentOrigin(), GetSize()); - } // Get the rectangle in our parent window - int GetChar() { return ::wgetch(m_window); } - int GetCursorX() { return getcurx(m_window); } - int GetCursorY() { return getcury(m_window); } - Rect GetFrame() { - return Rect(Point(), GetSize()); - } // Get our rectangle in our own coordinate system - Point GetParentOrigin() { return Point(GetParentX(), GetParentY()); } - Size GetSize() { return Size(GetWidth(), GetHeight()); } - int GetParentX() { return getparx(m_window); } - int GetParentY() { return getpary(m_window); } - int GetMaxX() { return getmaxx(m_window); } - int GetMaxY() { return getmaxy(m_window); } - int GetWidth() { return GetMaxX(); } - int GetHeight() { return GetMaxY(); } - void MoveCursor(int x, int y) { ::wmove(m_window, y, x); } - void MoveWindow(int x, int y) { MoveWindow(Point(x, y)); } - void Resize(int w, int h) { ::wresize(m_window, h, w); } - void Resize(const Size &size) { - ::wresize(m_window, size.height, size.width); - } - void PutChar(int ch) { ::waddch(m_window, ch); } - void PutCString(const char *s, int len = -1) { ::waddnstr(m_window, s, len); } - void SetBackground(int color_pair_idx) { - ::wbkgd(m_window, COLOR_PAIR(color_pair_idx)); - } - - void PutCStringTruncated(const char *s, int right_pad) { - int bytes_left = GetWidth() - GetCursorX(); - if (bytes_left > right_pad) { - bytes_left -= right_pad; - ::waddnstr(m_window, s, bytes_left); - } - } - - void MoveWindow(const Point &origin) { - const bool moving_window = origin != GetParentOrigin(); - if (m_is_subwin && moving_window) { - // Can't move subwindows, must delete and re-create - Size size = GetSize(); - Reset(::subwin(m_parent->m_window, size.height, size.width, origin.y, - origin.x), - true); - } else { - ::mvwin(m_window, origin.y, origin.x); - } - } - - void SetBounds(const Rect &bounds) { - const bool moving_window = bounds.origin != GetParentOrigin(); - if (m_is_subwin && moving_window) { - // Can't move subwindows, must delete and re-create - Reset(::subwin(m_parent->m_window, bounds.size.height, bounds.size.width, - bounds.origin.y, bounds.origin.x), - true); - } else { - if (moving_window) - MoveWindow(bounds.origin); - Resize(bounds.size); - } - } - - void Printf(const char *format, ...) 
__attribute__((format(printf, 2, 3))) { - va_list args; - va_start(args, format); - vwprintw(m_window, format, args); - va_end(args); - } - - void Touch() { - ::touchwin(m_window); - if (m_parent) - m_parent->Touch(); - } - - WindowSP CreateSubWindow(const char *name, const Rect &bounds, - bool make_active) { - auto get_window = [this, &bounds]() { - return m_window - ? ::subwin(m_window, bounds.size.height, bounds.size.width, - bounds.origin.y, bounds.origin.x) - : ::newwin(bounds.size.height, bounds.size.width, - bounds.origin.y, bounds.origin.x); - }; - WindowSP subwindow_sp = std::make_shared<Window>(name, get_window(), true); - subwindow_sp->m_is_subwin = subwindow_sp.operator bool(); - subwindow_sp->m_parent = this; - if (make_active) { - m_prev_active_window_idx = m_curr_active_window_idx; - m_curr_active_window_idx = m_subwindows.size(); - } - m_subwindows.push_back(subwindow_sp); - ::top_panel(subwindow_sp->m_panel); - m_needs_update = true; - return subwindow_sp; - } - - bool RemoveSubWindow(Window *window) { - Windows::iterator pos, end = m_subwindows.end(); - size_t i = 0; - for (pos = m_subwindows.begin(); pos != end; ++pos, ++i) { - if ((*pos).get() == window) { - if (m_prev_active_window_idx == i) - m_prev_active_window_idx = UINT32_MAX; - else if (m_prev_active_window_idx != UINT32_MAX && - m_prev_active_window_idx > i) - --m_prev_active_window_idx; - - if (m_curr_active_window_idx == i) - m_curr_active_window_idx = UINT32_MAX; - else if (m_curr_active_window_idx != UINT32_MAX && - m_curr_active_window_idx > i) - --m_curr_active_window_idx; - window->Erase(); - m_subwindows.erase(pos); - m_needs_update = true; - if (m_parent) - m_parent->Touch(); - else - ::touchwin(stdscr); - return true; - } - } - return false; - } - - WindowSP FindSubWindow(const char *name) { - Windows::iterator pos, end = m_subwindows.end(); - size_t i = 0; - for (pos = m_subwindows.begin(); pos != end; ++pos, ++i) { - if ((*pos)->m_name == name) - return *pos; - } - return WindowSP(); - } - - void RemoveSubWindows() { - m_curr_active_window_idx = UINT32_MAX; - m_prev_active_window_idx = UINT32_MAX; - for (Windows::iterator pos = m_subwindows.begin(); - pos != m_subwindows.end(); pos = m_subwindows.erase(pos)) { - (*pos)->Erase(); - } - if (m_parent) - m_parent->Touch(); - else - ::touchwin(stdscr); - } - - WINDOW *get() { return m_window; } - - operator WINDOW *() { return m_window; } - - // Window drawing utilities - void DrawTitleBox(const char *title, const char *bottom_message = nullptr) { - attr_t attr = 0; - if (IsActive()) - attr = A_BOLD | COLOR_PAIR(2); - else - attr = 0; - if (attr) - AttributeOn(attr); - - Box(); - MoveCursor(3, 0); - - if (title && title[0]) { - PutChar('<'); - PutCString(title); - PutChar('>'); - } - - if (bottom_message && bottom_message[0]) { - int bottom_message_length = strlen(bottom_message); - int x = GetWidth() - 3 - (bottom_message_length + 2); - - if (x > 0) { - MoveCursor(x, GetHeight() - 1); - PutChar('['); - PutCString(bottom_message); - PutChar(']'); - } else { - MoveCursor(1, GetHeight() - 1); - PutChar('['); - PutCStringTruncated(bottom_message, 1); - } - } - if (attr) - AttributeOff(attr); - } - - virtual void Draw(bool force) { - if (m_delegate_sp && m_delegate_sp->WindowDelegateDraw(*this, force)) - return; - - for (auto &subwindow_sp : m_subwindows) - subwindow_sp->Draw(force); - } - - bool CreateHelpSubwindow() { - if (m_delegate_sp) { - const char *text = m_delegate_sp->WindowDelegateGetHelpText(); - KeyHelp *key_help = 
m_delegate_sp->WindowDelegateGetKeyHelp(); - if ((text && text[0]) || key_help) { - std::unique_ptr<HelpDialogDelegate> help_delegate_up( - new HelpDialogDelegate(text, key_help)); - const size_t num_lines = help_delegate_up->GetNumLines(); - const size_t max_length = help_delegate_up->GetMaxLineLength(); - Rect bounds = GetBounds(); - bounds.Inset(1, 1); - if (max_length + 4 < static_cast<size_t>(bounds.size.width)) { - bounds.origin.x += (bounds.size.width - max_length + 4) / 2; - bounds.size.width = max_length + 4; - } else { - if (bounds.size.width > 100) { - const int inset_w = bounds.size.width / 4; - bounds.origin.x += inset_w; - bounds.size.width -= 2 * inset_w; - } - } - - if (num_lines + 2 < static_cast<size_t>(bounds.size.height)) { - bounds.origin.y += (bounds.size.height - num_lines + 2) / 2; - bounds.size.height = num_lines + 2; - } else { - if (bounds.size.height > 100) { - const int inset_h = bounds.size.height / 4; - bounds.origin.y += inset_h; - bounds.size.height -= 2 * inset_h; - } - } - WindowSP help_window_sp; - Window *parent_window = GetParent(); - if (parent_window) - help_window_sp = parent_window->CreateSubWindow("Help", bounds, true); - else - help_window_sp = CreateSubWindow("Help", bounds, true); - help_window_sp->SetDelegate( - WindowDelegateSP(help_delegate_up.release())); - return true; - } - } - return false; - } - - virtual HandleCharResult HandleChar(int key) { - // Always check the active window first - HandleCharResult result = eKeyNotHandled; - WindowSP active_window_sp = GetActiveWindow(); - if (active_window_sp) { - result = active_window_sp->HandleChar(key); - if (result != eKeyNotHandled) - return result; - } - - if (m_delegate_sp) { - result = m_delegate_sp->WindowDelegateHandleChar(*this, key); - if (result != eKeyNotHandled) - return result; - } - - // Then check for any windows that want any keys that weren't handled. This - // is typically only for a menubar. Make a copy of the subwindows in case - // any HandleChar() functions muck with the subwindows. If we don't do - // this, we can crash when iterating over the subwindows. 
- Windows subwindows(m_subwindows); - for (auto subwindow_sp : subwindows) { - if (!subwindow_sp->m_can_activate) { - HandleCharResult result = subwindow_sp->HandleChar(key); - if (result != eKeyNotHandled) - return result; - } - } - - return eKeyNotHandled; - } - - WindowSP GetActiveWindow() { - if (!m_subwindows.empty()) { - if (m_curr_active_window_idx >= m_subwindows.size()) { - if (m_prev_active_window_idx < m_subwindows.size()) { - m_curr_active_window_idx = m_prev_active_window_idx; - m_prev_active_window_idx = UINT32_MAX; - } else if (IsActive()) { - m_prev_active_window_idx = UINT32_MAX; - m_curr_active_window_idx = UINT32_MAX; - - // Find first window that wants to be active if this window is active - const size_t num_subwindows = m_subwindows.size(); - for (size_t i = 0; i < num_subwindows; ++i) { - if (m_subwindows[i]->GetCanBeActive()) { - m_curr_active_window_idx = i; - break; - } - } - } - } - - if (m_curr_active_window_idx < m_subwindows.size()) - return m_subwindows[m_curr_active_window_idx]; - } - return WindowSP(); - } - - bool GetCanBeActive() const { return m_can_activate; } - - void SetCanBeActive(bool b) { m_can_activate = b; } - - void SetDelegate(const WindowDelegateSP &delegate_sp) { - m_delegate_sp = delegate_sp; - } - - Window *GetParent() const { return m_parent; } - - bool IsActive() const { - if (m_parent) - return m_parent->GetActiveWindow().get() == this; - else - return true; // Top level window is always active - } - - void SelectNextWindowAsActive() { - // Move active focus to next window - const size_t num_subwindows = m_subwindows.size(); - if (m_curr_active_window_idx == UINT32_MAX) { - uint32_t idx = 0; - for (auto subwindow_sp : m_subwindows) { - if (subwindow_sp->GetCanBeActive()) { - m_curr_active_window_idx = idx; - break; - } - ++idx; - } - } else if (m_curr_active_window_idx + 1 < num_subwindows) { - bool handled = false; - m_prev_active_window_idx = m_curr_active_window_idx; - for (size_t idx = m_curr_active_window_idx + 1; idx < num_subwindows; - ++idx) { - if (m_subwindows[idx]->GetCanBeActive()) { - m_curr_active_window_idx = idx; - handled = true; - break; - } - } - if (!handled) { - for (size_t idx = 0; idx <= m_prev_active_window_idx; ++idx) { - if (m_subwindows[idx]->GetCanBeActive()) { - m_curr_active_window_idx = idx; - break; - } - } - } - } else { - m_prev_active_window_idx = m_curr_active_window_idx; - for (size_t idx = 0; idx < num_subwindows; ++idx) { - if (m_subwindows[idx]->GetCanBeActive()) { - m_curr_active_window_idx = idx; - break; - } - } - } - } - - const char *GetName() const { return m_name.c_str(); } - -protected: - std::string m_name; - WINDOW *m_window; - PANEL *m_panel; - Window *m_parent; - Windows m_subwindows; - WindowDelegateSP m_delegate_sp; - uint32_t m_curr_active_window_idx; - uint32_t m_prev_active_window_idx; - bool m_delete; - bool m_needs_update; - bool m_can_activate; - bool m_is_subwin; - -private: - DISALLOW_COPY_AND_ASSIGN(Window); -}; - -class MenuDelegate { -public: - virtual ~MenuDelegate() = default; - - virtual MenuActionResult MenuDelegateAction(Menu &menu) = 0; -}; - -class Menu : public WindowDelegate { -public: - enum class Type { Invalid, Bar, Item, Separator }; - - // Menubar or separator constructor - Menu(Type type); - - // Menuitem constructor - Menu(const char *name, const char *key_name, int key_value, - uint64_t identifier); - - ~Menu() override = default; - - const MenuDelegateSP &GetDelegate() const { return m_delegate_sp; } - - void SetDelegate(const MenuDelegateSP &delegate_sp) { 
- m_delegate_sp = delegate_sp; - } - - void RecalculateNameLengths(); - - void AddSubmenu(const MenuSP &menu_sp); - - int DrawAndRunMenu(Window &window); - - void DrawMenuTitle(Window &window, bool highlight); - - bool WindowDelegateDraw(Window &window, bool force) override; - - HandleCharResult WindowDelegateHandleChar(Window &window, int key) override; - - MenuActionResult ActionPrivate(Menu &menu) { - MenuActionResult result = MenuActionResult::NotHandled; - if (m_delegate_sp) { - result = m_delegate_sp->MenuDelegateAction(menu); - if (result != MenuActionResult::NotHandled) - return result; - } else if (m_parent) { - result = m_parent->ActionPrivate(menu); - if (result != MenuActionResult::NotHandled) - return result; - } - return m_canned_result; - } - - MenuActionResult Action() { - // Call the recursive action so it can try to handle it with the menu - // delegate, and if not, try our parent menu - return ActionPrivate(*this); - } - - void SetCannedResult(MenuActionResult result) { m_canned_result = result; } - - Menus &GetSubmenus() { return m_submenus; } - - const Menus &GetSubmenus() const { return m_submenus; } - - int GetSelectedSubmenuIndex() const { return m_selected; } - - void SetSelectedSubmenuIndex(int idx) { m_selected = idx; } - - Type GetType() const { return m_type; } - - int GetStartingColumn() const { return m_start_col; } - - void SetStartingColumn(int col) { m_start_col = col; } - - int GetKeyValue() const { return m_key_value; } - - std::string &GetName() { return m_name; } - - int GetDrawWidth() const { - return m_max_submenu_name_length + m_max_submenu_key_name_length + 8; - } - - uint64_t GetIdentifier() const { return m_identifier; } - - void SetIdentifier(uint64_t identifier) { m_identifier = identifier; } - -protected: - std::string m_name; - std::string m_key_name; - uint64_t m_identifier; - Type m_type; - int m_key_value; - int m_start_col; - int m_max_submenu_name_length; - int m_max_submenu_key_name_length; - int m_selected; - Menu *m_parent; - Menus m_submenus; - WindowSP m_menu_window_sp; - MenuActionResult m_canned_result; - MenuDelegateSP m_delegate_sp; -}; - -// Menubar or separator constructor -Menu::Menu(Type type) - : m_name(), m_key_name(), m_identifier(0), m_type(type), m_key_value(0), - m_start_col(0), m_max_submenu_name_length(0), - m_max_submenu_key_name_length(0), m_selected(0), m_parent(nullptr), - m_submenus(), m_canned_result(MenuActionResult::NotHandled), - m_delegate_sp() {} - -// Menuitem constructor -Menu::Menu(const char *name, const char *key_name, int key_value, - uint64_t identifier) - : m_name(), m_key_name(), m_identifier(identifier), m_type(Type::Invalid), - m_key_value(key_value), m_start_col(0), m_max_submenu_name_length(0), - m_max_submenu_key_name_length(0), m_selected(0), m_parent(nullptr), - m_submenus(), m_canned_result(MenuActionResult::NotHandled), - m_delegate_sp() { - if (name && name[0]) { - m_name = name; - m_type = Type::Item; - if (key_name && key_name[0]) - m_key_name = key_name; - } else { - m_type = Type::Separator; - } -} - -void Menu::RecalculateNameLengths() { - m_max_submenu_name_length = 0; - m_max_submenu_key_name_length = 0; - Menus &submenus = GetSubmenus(); - const size_t num_submenus = submenus.size(); - for (size_t i = 0; i < num_submenus; ++i) { - Menu *submenu = submenus[i].get(); - if (static_cast<size_t>(m_max_submenu_name_length) < submenu->m_name.size()) - m_max_submenu_name_length = submenu->m_name.size(); - if (static_cast<size_t>(m_max_submenu_key_name_length) < - submenu->m_key_name.size()) - 
  m_max_submenu_key_name_length = submenu->m_key_name.size();
-  }
-}
-
-void Menu::AddSubmenu(const MenuSP &menu_sp) {
-  menu_sp->m_parent = this;
-  if (static_cast<size_t>(m_max_submenu_name_length) < menu_sp->m_name.size())
-    m_max_submenu_name_length = menu_sp->m_name.size();
-  if (static_cast<size_t>(m_max_submenu_key_name_length) <
-      menu_sp->m_key_name.size())
-    m_max_submenu_key_name_length = menu_sp->m_key_name.size();
-  m_submenus.push_back(menu_sp);
-}
-
-void Menu::DrawMenuTitle(Window &window, bool highlight) {
-  if (m_type == Type::Separator) {
-    window.MoveCursor(0, window.GetCursorY());
-    window.PutChar(ACS_LTEE);
-    int width = window.GetWidth();
-    if (width > 2) {
-      width -= 2;
-      for (int i = 0; i < width; ++i)
-        window.PutChar(ACS_HLINE);
-    }
-    window.PutChar(ACS_RTEE);
-  } else {
-    const int shortcut_key = m_key_value;
-    bool underlined_shortcut = false;
-    const attr_t highlight_attr = A_REVERSE;
-    if (highlight)
-      window.AttributeOn(highlight_attr);
-    if (isprint(shortcut_key)) {
-      size_t lower_pos = m_name.find(tolower(shortcut_key));
-      size_t upper_pos = m_name.find(toupper(shortcut_key));
-      const char *name = m_name.c_str();
-      size_t pos = std::min(lower_pos, upper_pos);
-      if (pos != std::string::npos) {
-        underlined_shortcut = true;
-        if (pos > 0) {
-          window.PutCString(name, pos);
-          name += pos;
-        }
-        const attr_t shortcut_attr = A_UNDERLINE | A_BOLD;
-        window.AttributeOn(shortcut_attr);
-        window.PutChar(name[0]);
-        window.AttributeOff(shortcut_attr);
-        name++;
-        if (name[0])
-          window.PutCString(name);
-      }
-    }
-
-    if (!underlined_shortcut) {
-      window.PutCString(m_name.c_str());
-    }
-
-    if (highlight)
-      window.AttributeOff(highlight_attr);
-
-    if (m_key_name.empty()) {
-      if (!underlined_shortcut && isprint(m_key_value)) {
-        window.AttributeOn(COLOR_PAIR(3));
-        window.Printf(" (%c)", m_key_value);
-        window.AttributeOff(COLOR_PAIR(3));
-      }
-    } else {
-      window.AttributeOn(COLOR_PAIR(3));
-      window.Printf(" (%s)", m_key_name.c_str());
-      window.AttributeOff(COLOR_PAIR(3));
-    }
-  }
-}
-
-bool Menu::WindowDelegateDraw(Window &window, bool force) {
-  Menus &submenus = GetSubmenus();
-  const size_t num_submenus = submenus.size();
-  const int selected_idx = GetSelectedSubmenuIndex();
-  Menu::Type menu_type = GetType();
-  switch (menu_type) {
-  case Menu::Type::Bar: {
-    window.SetBackground(2);
-    window.MoveCursor(0, 0);
-    for (size_t i = 0; i < num_submenus; ++i) {
-      Menu *menu = submenus[i].get();
-      if (i > 0)
-        window.PutChar(' ');
-      menu->SetStartingColumn(window.GetCursorX());
-      window.PutCString("| ");
-      menu->DrawMenuTitle(window, false);
-    }
-    window.PutCString(" |");
-  } break;
-
-  case Menu::Type::Item: {
-    int y = 1;
-    int x = 3;
-    // Draw the menu
-    int cursor_x = 0;
-    int cursor_y = 0;
-    window.Erase();
-    window.SetBackground(2);
-    window.Box();
-    for (size_t i = 0; i < num_submenus; ++i) {
-      const bool is_selected = (i == static_cast<size_t>(selected_idx));
-      window.MoveCursor(x, y + i);
-      if (is_selected) {
-        // Remember where we want the cursor to be
-        cursor_x = x - 1;
-        cursor_y = y + i;
-      }
-      submenus[i]->DrawMenuTitle(window, is_selected);
-    }
-    window.MoveCursor(cursor_x, cursor_y);
-  } break;
-
-  default:
-  case Menu::Type::Separator:
-    break;
-  }
-  return true; // Drawing handled...
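// DrawMenuTitle above reduces to: take the first case-insensitive occurrence
// of the shortcut key in the label and split the label around it. A minimal
// sketch of just that step (PutPlain/PutUnderlined are hypothetical helpers,
// not part of this file, and name is a std::string here):
//
//   size_t pos = std::min(name.find(tolower(key)), name.find(toupper(key)));
//   if (pos == std::string::npos) {
//     PutPlain(name);                  // no shortcut; draw the label as-is
//   } else {
//     PutPlain(name.substr(0, pos));   // text before the shortcut
//     PutUnderlined(name[pos]);        // shortcut char, A_UNDERLINE | A_BOLD
//     PutPlain(name.substr(pos + 1));  // text after the shortcut
//   }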
-} - -HandleCharResult Menu::WindowDelegateHandleChar(Window &window, int key) { - HandleCharResult result = eKeyNotHandled; - - Menus &submenus = GetSubmenus(); - const size_t num_submenus = submenus.size(); - const int selected_idx = GetSelectedSubmenuIndex(); - Menu::Type menu_type = GetType(); - if (menu_type == Menu::Type::Bar) { - MenuSP run_menu_sp; - switch (key) { - case KEY_DOWN: - case KEY_UP: - // Show last menu or first menu - if (selected_idx < static_cast(num_submenus)) - run_menu_sp = submenus[selected_idx]; - else if (!submenus.empty()) - run_menu_sp = submenus.front(); - result = eKeyHandled; - break; - - case KEY_RIGHT: - ++m_selected; - if (m_selected >= static_cast(num_submenus)) - m_selected = 0; - if (m_selected < static_cast(num_submenus)) - run_menu_sp = submenus[m_selected]; - else if (!submenus.empty()) - run_menu_sp = submenus.front(); - result = eKeyHandled; - break; - - case KEY_LEFT: - --m_selected; - if (m_selected < 0) - m_selected = num_submenus - 1; - if (m_selected < static_cast(num_submenus)) - run_menu_sp = submenus[m_selected]; - else if (!submenus.empty()) - run_menu_sp = submenus.front(); - result = eKeyHandled; - break; - - default: - for (size_t i = 0; i < num_submenus; ++i) { - if (submenus[i]->GetKeyValue() == key) { - SetSelectedSubmenuIndex(i); - run_menu_sp = submenus[i]; - result = eKeyHandled; - break; - } - } - break; - } - - if (run_menu_sp) { - // Run the action on this menu in case we need to populate the menu with - // dynamic content and also in case check marks, and any other menu - // decorations need to be calculated - if (run_menu_sp->Action() == MenuActionResult::Quit) - return eQuitApplication; - - Rect menu_bounds; - menu_bounds.origin.x = run_menu_sp->GetStartingColumn(); - menu_bounds.origin.y = 1; - menu_bounds.size.width = run_menu_sp->GetDrawWidth(); - menu_bounds.size.height = run_menu_sp->GetSubmenus().size() + 2; - if (m_menu_window_sp) - window.GetParent()->RemoveSubWindow(m_menu_window_sp.get()); - - m_menu_window_sp = window.GetParent()->CreateSubWindow( - run_menu_sp->GetName().c_str(), menu_bounds, true); - m_menu_window_sp->SetDelegate(run_menu_sp); - } - } else if (menu_type == Menu::Type::Item) { - switch (key) { - case KEY_DOWN: - if (m_submenus.size() > 1) { - const int start_select = m_selected; - while (++m_selected != start_select) { - if (static_cast(m_selected) >= num_submenus) - m_selected = 0; - if (m_submenus[m_selected]->GetType() == Type::Separator) - continue; - else - break; - } - return eKeyHandled; - } - break; - - case KEY_UP: - if (m_submenus.size() > 1) { - const int start_select = m_selected; - while (--m_selected != start_select) { - if (m_selected < static_cast(0)) - m_selected = num_submenus - 1; - if (m_submenus[m_selected]->GetType() == Type::Separator) - continue; - else - break; - } - return eKeyHandled; - } - break; - - case KEY_RETURN: - if (static_cast(selected_idx) < num_submenus) { - if (submenus[selected_idx]->Action() == MenuActionResult::Quit) - return eQuitApplication; - window.GetParent()->RemoveSubWindow(&window); - return eKeyHandled; - } - break; - - case KEY_ESCAPE: // Beware: pressing escape key has 1 to 2 second delay in - // case other chars are entered for escaped sequences - window.GetParent()->RemoveSubWindow(&window); - return eKeyHandled; - - default: - for (size_t i = 0; i < num_submenus; ++i) { - Menu *menu = submenus[i].get(); - if (menu->GetKeyValue() == key) { - SetSelectedSubmenuIndex(i); - window.GetParent()->RemoveSubWindow(&window); - if (menu->Action() 
== MenuActionResult::Quit)
-            return eQuitApplication;
-          return eKeyHandled;
-        }
-      }
-      break;
-    }
-  } else if (menu_type == Menu::Type::Separator) {
-  }
-  return result;
-}
-
-class Application {
-public:
-  Application(FILE *in, FILE *out)
-      : m_window_sp(), m_screen(nullptr), m_in(in), m_out(out) {}
-
-  ~Application() {
-    m_window_delegates.clear();
-    m_window_sp.reset();
-    if (m_screen) {
-      ::delscreen(m_screen);
-      m_screen = nullptr;
-    }
-  }
-
-  void Initialize() {
-    ::setlocale(LC_ALL, "");
-    ::setlocale(LC_CTYPE, "");
-    m_screen = ::newterm(nullptr, m_out, m_in);
-    ::start_color();
-    ::curs_set(0);
-    ::noecho();
-    ::keypad(stdscr, TRUE);
-  }
-
-  void Terminate() { ::endwin(); }
-
-  void Run(Debugger &debugger) {
-    bool done = false;
-    int delay_in_tenths_of_a_second = 1;
-
-    // Alas the threading model in curses is a bit lame, so we need to resort
-    // to polling every tenth of a second. We could poll stdin ourselves and
-    // then pass the keys down, but then we would need to translate all of the
-    // escape sequences ourselves. So we resort to polling for input because
-    // we need to receive async process events while in this loop.
-
-    halfdelay(delay_in_tenths_of_a_second); // Poll using some number of
-                                            // tenths of a second when calling
-                                            // Window::GetChar()
-
-    ListenerSP listener_sp(
-        Listener::MakeListener("lldb.IOHandler.curses.Application"));
-    ConstString broadcaster_class_target(Target::GetStaticBroadcasterClass());
-    ConstString broadcaster_class_process(Process::GetStaticBroadcasterClass());
-    ConstString broadcaster_class_thread(Thread::GetStaticBroadcasterClass());
-    debugger.EnableForwardEvents(listener_sp);
-
-    bool update = true;
-#if defined(__APPLE__)
-    std::deque<int> escape_chars;
-#endif
-
-    while (!done) {
-      if (update) {
-        m_window_sp->Draw(false);
-        // All windows should be calling Window::DeferredRefresh() instead of
-        // Window::Refresh() so we can do a single update and avoid any screen
-        // blinking
-        update_panels();
-
-        // Cursor hiding isn't working on MacOSX, so hide it in the top left
-        // corner
-        m_window_sp->MoveCursor(0, 0);
-
-        doupdate();
-        update = false;
-      }
-
-#if defined(__APPLE__)
-      // Terminal.app doesn't map its function keys correctly, F1-F4 default
-      // to: \033OP, \033OQ, \033OR, \033OS, so let's take care of this here
-      // if possible
-      int ch;
-      if (escape_chars.empty())
-        ch = m_window_sp->GetChar();
-      else {
-        ch = escape_chars.front();
-        escape_chars.pop_front();
-      }
-      if (ch == KEY_ESCAPE) {
-        int ch2 = m_window_sp->GetChar();
-        if (ch2 == 'O') {
-          int ch3 = m_window_sp->GetChar();
-          switch (ch3) {
-          case 'P':
-            ch = KEY_F(1);
-            break;
-          case 'Q':
-            ch = KEY_F(2);
-            break;
-          case 'R':
-            ch = KEY_F(3);
-            break;
-          case 'S':
-            ch = KEY_F(4);
-            break;
-          default:
-            escape_chars.push_back(ch2);
-            if (ch3 != -1)
-              escape_chars.push_back(ch3);
-            break;
-          }
-        } else if (ch2 != -1)
-          escape_chars.push_back(ch2);
-      }
-#else
-      int ch = m_window_sp->GetChar();
-
-#endif
-      if (ch == -1) {
-        if (feof(m_in) || ferror(m_in)) {
-          done = true;
-        } else {
-          // Just a timeout from using halfdelay(), check for events
-          EventSP event_sp;
-          while (listener_sp->PeekAtNextEvent()) {
-            listener_sp->GetEvent(event_sp, std::chrono::seconds(0));
-
-            if (event_sp) {
-              Broadcaster *broadcaster = event_sp->GetBroadcaster();
-              if (broadcaster) {
-                // uint32_t event_type = event_sp->GetType();
-                ConstString broadcaster_class(
-                    broadcaster->GetBroadcasterClass());
-                if (broadcaster_class == broadcaster_class_process) {
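// Process events arriving here are why the loop polls at all: GetChar()
// returns -1 on timeout, and the timeout path drains queued debugger events
// instead of blocking on the keyboard. The bare-curses shape of the loop, as
// a sketch (drain_events/dispatch_key are illustrative placeholders):
//
//   halfdelay(1);              // getch() gives up after 0.1 seconds
//   for (;;) {
//     int ch = getch();
//     if (ch == ERR) {         // timeout, not a key press
//       drain_events();        // poll the async event queue
//       continue;
//     }
//     if (!dispatch_key(ch))   // returns false when the user quits
//       break;
//   }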
-                  debugger.GetCommandInterpreter().UpdateExecutionContext(
-                      nullptr);
-                  update = true;
-                  continue; // Don't get any key, just update our view
-                }
-              }
-            }
-          }
-        }
-      } else {
-        HandleCharResult key_result = m_window_sp->HandleChar(ch);
-        switch (key_result) {
-        case eKeyHandled:
-          debugger.GetCommandInterpreter().UpdateExecutionContext(nullptr);
-          update = true;
-          break;
-        case eKeyNotHandled:
-          break;
-        case eQuitApplication:
-          done = true;
-          break;
-        }
-      }
-    }
-
-    debugger.CancelForwardEvents(listener_sp);
-  }
-
-  WindowSP &GetMainWindow() {
-    if (!m_window_sp)
-      m_window_sp = std::make_shared<Window>("main", stdscr, false);
-    return m_window_sp;
-  }
-
-protected:
-  WindowSP m_window_sp;
-  WindowDelegates m_window_delegates;
-  SCREEN *m_screen;
-  FILE *m_in;
-  FILE *m_out;
-};
-
-} // namespace curses
-
-using namespace curses;
-
-struct Row {
-  ValueObjectManager value;
-  Row *parent;
-  // The process stop ID when the children were calculated.
-  uint32_t children_stop_id;
-  int row_idx;
-  int x;
-  int y;
-  bool might_have_children;
-  bool expanded;
-  bool calculated_children;
-  std::vector<Row> children;
-
-  Row(const ValueObjectSP &v, Row *p)
-      : value(v, lldb::eDynamicDontRunTarget, true), parent(p),
-        children_stop_id(0), row_idx(0), x(1), y(1),
-        might_have_children(v ? v->MightHaveChildren() : false),
-        expanded(false), calculated_children(false), children() {}
-
-  size_t GetDepth() const {
-    if (parent)
-      return 1 + parent->GetDepth();
-    return 0;
-  }
-
-  void Expand() { expanded = true; }
-
-  std::vector<Row> &GetChildren() {
-    ProcessSP process_sp = value.GetProcessSP();
-    // Check the process before asking it for its stop ID.
-    if (process_sp) {
-      auto stop_id = process_sp->GetStopID();
-      if (stop_id != children_stop_id) {
-        children_stop_id = stop_id;
-        calculated_children = false;
-      }
-    }
-    if (!calculated_children) {
-      children.clear();
-      calculated_children = true;
-      ValueObjectSP valobj = value.GetSP();
-      if (valobj) {
-        const size_t num_children = valobj->GetNumChildren();
-        for (size_t i = 0; i < num_children; ++i) {
-          children.push_back(Row(valobj->GetChildAtIndex(i, true), this));
-        }
-      }
-    }
-    return children;
-  }
-
-  void Unexpand() {
-    expanded = false;
-    calculated_children = false;
-    children.clear();
-  }
-
-  void DrawTree(Window &window) {
-    if (parent)
-      parent->DrawTreeForChild(window, this, 0);
-
-    if (might_have_children) {
-      // If we can get UTF8 characters to work we should try to use the
-      // "symbol" UTF8 string below
-      // const char *symbol = "";
-      // if (row.expanded)
-      //   symbol = "\xe2\x96\xbd ";
-      // else
-      //   symbol = "\xe2\x96\xb7 ";
-      // window.PutCString (symbol);
-
-      // The ACS_DARROW and ACS_RARROW don't look very nice; they are just a
-      // 'v' or '>' character...
-      // if (expanded)
-      //   window.PutChar (ACS_DARROW);
-      // else
-      //   window.PutChar (ACS_RARROW);
-      // Since we can't find any good looking right arrow/down arrow symbols,
-      // just use a diamond...
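// With a wide-character curses build (ncursesw) and a UTF-8 locale, the
// triangle glyphs wished for above should work; a sketch, assuming the
// setlocale(LC_ALL, "") call from Application::Initialize() succeeded:
//
//   const char *symbol = expanded
//                            ? "\xe2\x96\xbd " // U+25BD white down triangle
//                            : "\xe2\x96\xb7 "; // U+25B7 white right triangle
//   waddstr(win, symbol); // ncursesw decodes the multibyte string itself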
- window.PutChar(ACS_DIAMOND); - window.PutChar(ACS_HLINE); - } - } - - void DrawTreeForChild(Window &window, Row *child, uint32_t reverse_depth) { - if (parent) - parent->DrawTreeForChild(window, this, reverse_depth + 1); - - if (&GetChildren().back() == child) { - // Last child - if (reverse_depth == 0) { - window.PutChar(ACS_LLCORNER); - window.PutChar(ACS_HLINE); - } else { - window.PutChar(' '); - window.PutChar(' '); - } - } else { - if (reverse_depth == 0) { - window.PutChar(ACS_LTEE); - window.PutChar(ACS_HLINE); - } else { - window.PutChar(ACS_VLINE); - window.PutChar(' '); - } - } - } -}; - -struct DisplayOptions { - bool show_types; -}; - -class TreeItem; - -class TreeDelegate { -public: - TreeDelegate() = default; - virtual ~TreeDelegate() = default; - - virtual void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) = 0; - virtual void TreeDelegateGenerateChildren(TreeItem &item) = 0; - virtual bool TreeDelegateItemSelected( - TreeItem &item) = 0; // Return true if we need to update views -}; - -typedef std::shared_ptr TreeDelegateSP; - -class TreeItem { -public: - TreeItem(TreeItem *parent, TreeDelegate &delegate, bool might_have_children) - : m_parent(parent), m_delegate(delegate), m_user_data(nullptr), - m_identifier(0), m_row_idx(-1), m_children(), - m_might_have_children(might_have_children), m_is_expanded(false) {} - - TreeItem &operator=(const TreeItem &rhs) { - if (this != &rhs) { - m_parent = rhs.m_parent; - m_delegate = rhs.m_delegate; - m_user_data = rhs.m_user_data; - m_identifier = rhs.m_identifier; - m_row_idx = rhs.m_row_idx; - m_children = rhs.m_children; - m_might_have_children = rhs.m_might_have_children; - m_is_expanded = rhs.m_is_expanded; - } - return *this; - } - - size_t GetDepth() const { - if (m_parent) - return 1 + m_parent->GetDepth(); - return 0; - } - - int GetRowIndex() const { return m_row_idx; } - - void ClearChildren() { m_children.clear(); } - - void Resize(size_t n, const TreeItem &t) { m_children.resize(n, t); } - - TreeItem &operator[](size_t i) { return m_children[i]; } - - void SetRowIndex(int row_idx) { m_row_idx = row_idx; } - - size_t GetNumChildren() { - m_delegate.TreeDelegateGenerateChildren(*this); - return m_children.size(); - } - - void ItemWasSelected() { m_delegate.TreeDelegateItemSelected(*this); } - - void CalculateRowIndexes(int &row_idx) { - SetRowIndex(row_idx); - ++row_idx; - - const bool expanded = IsExpanded(); - - // The root item must calculate its children, or we must calculate the - // number of children if the item is expanded - if (m_parent == nullptr || expanded) - GetNumChildren(); - - for (auto &item : m_children) { - if (expanded) - item.CalculateRowIndexes(row_idx); - else - item.SetRowIndex(-1); - } - } - - TreeItem *GetParent() { return m_parent; } - - bool IsExpanded() const { return m_is_expanded; } - - void Expand() { m_is_expanded = true; } - - void Unexpand() { m_is_expanded = false; } - - bool Draw(Window &window, const int first_visible_row, - const uint32_t selected_row_idx, int &row_idx, int &num_rows_left) { - if (num_rows_left <= 0) - return false; - - if (m_row_idx >= first_visible_row) { - window.MoveCursor(2, row_idx + 1); - - if (m_parent) - m_parent->DrawTreeForChild(window, this, 0); - - if (m_might_have_children) { - // It we can get UTF8 characters to work we should try to use the - // "symbol" UTF8 string below - // const char *symbol = ""; - // if (row.expanded) - // symbol = "\xe2\x96\xbd "; - // else - // symbol = "\xe2\x96\xb7 "; - // window.PutCString (symbol); - - // The 
ACS_DARROW and ACS_RARROW don't look very nice they are just a - // 'v' or '>' character... - // if (expanded) - // window.PutChar (ACS_DARROW); - // else - // window.PutChar (ACS_RARROW); - // Since we can't find any good looking right arrow/down arrow symbols, - // just use a diamond... - window.PutChar(ACS_DIAMOND); - window.PutChar(ACS_HLINE); - } - bool highlight = (selected_row_idx == static_cast(m_row_idx)) && - window.IsActive(); - - if (highlight) - window.AttributeOn(A_REVERSE); - - m_delegate.TreeDelegateDrawTreeItem(*this, window); - - if (highlight) - window.AttributeOff(A_REVERSE); - ++row_idx; - --num_rows_left; - } - - if (num_rows_left <= 0) - return false; // We are done drawing... - - if (IsExpanded()) { - for (auto &item : m_children) { - // If we displayed all the rows and item.Draw() returns false we are - // done drawing and can exit this for loop - if (!item.Draw(window, first_visible_row, selected_row_idx, row_idx, - num_rows_left)) - break; - } - } - return num_rows_left >= 0; // Return true if not done drawing yet - } - - void DrawTreeForChild(Window &window, TreeItem *child, - uint32_t reverse_depth) { - if (m_parent) - m_parent->DrawTreeForChild(window, this, reverse_depth + 1); - - if (&m_children.back() == child) { - // Last child - if (reverse_depth == 0) { - window.PutChar(ACS_LLCORNER); - window.PutChar(ACS_HLINE); - } else { - window.PutChar(' '); - window.PutChar(' '); - } - } else { - if (reverse_depth == 0) { - window.PutChar(ACS_LTEE); - window.PutChar(ACS_HLINE); - } else { - window.PutChar(ACS_VLINE); - window.PutChar(' '); - } - } - } - - TreeItem *GetItemForRowIndex(uint32_t row_idx) { - if (static_cast(m_row_idx) == row_idx) - return this; - if (m_children.empty()) - return nullptr; - if (IsExpanded()) { - for (auto &item : m_children) { - TreeItem *selected_item_ptr = item.GetItemForRowIndex(row_idx); - if (selected_item_ptr) - return selected_item_ptr; - } - } - return nullptr; - } - - void *GetUserData() const { return m_user_data; } - - void SetUserData(void *user_data) { m_user_data = user_data; } - - uint64_t GetIdentifier() const { return m_identifier; } - - void SetIdentifier(uint64_t identifier) { m_identifier = identifier; } - - void SetMightHaveChildren(bool b) { m_might_have_children = b; } - -protected: - TreeItem *m_parent; - TreeDelegate &m_delegate; - void *m_user_data; - uint64_t m_identifier; - int m_row_idx; // Zero based visible row index, -1 if not visible or for the - // root item - std::vector m_children; - bool m_might_have_children; - bool m_is_expanded; -}; - -class TreeWindowDelegate : public WindowDelegate { -public: - TreeWindowDelegate(Debugger &debugger, const TreeDelegateSP &delegate_sp) - : m_debugger(debugger), m_delegate_sp(delegate_sp), - m_root(nullptr, *delegate_sp, true), m_selected_item(nullptr), - m_num_rows(0), m_selected_row_idx(0), m_first_visible_row(0), - m_min_x(0), m_min_y(0), m_max_x(0), m_max_y(0) {} - - int NumVisibleRows() const { return m_max_y - m_min_y; } - - bool WindowDelegateDraw(Window &window, bool force) override { - ExecutionContext exe_ctx( - m_debugger.GetCommandInterpreter().GetExecutionContext()); - Process *process = exe_ctx.GetProcessPtr(); - - bool display_content = false; - if (process) { - StateType state = process->GetState(); - if (StateIsStoppedState(state, true)) { - // We are stopped, so it is ok to - display_content = true; - } else if (StateIsRunningState(state)) { - return true; // Don't do any updating when we are running - } - } - - m_min_x = 2; - m_min_y = 1; - 
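// The m_first_visible_row adjustments below implement the usual
// keep-the-selection-visible scroll clamp. The invariant being maintained is
// first_visible <= selected < first_visible + visible_rows:
//
//   if (selected < first_visible)                      // scrolled above view
//     first_visible = selected;
//   else if (first_visible + visible_rows <= selected) // scrolled below view
//     first_visible = selected - visible_rows + 1;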
m_max_x = window.GetWidth() - 1; - m_max_y = window.GetHeight() - 1; - - window.Erase(); - window.DrawTitleBox(window.GetName()); - - if (display_content) { - const int num_visible_rows = NumVisibleRows(); - m_num_rows = 0; - m_root.CalculateRowIndexes(m_num_rows); - - // If we unexpanded while having something selected our total number of - // rows is less than the num visible rows, then make sure we show all the - // rows by setting the first visible row accordingly. - if (m_first_visible_row > 0 && m_num_rows < num_visible_rows) - m_first_visible_row = 0; - - // Make sure the selected row is always visible - if (m_selected_row_idx < m_first_visible_row) - m_first_visible_row = m_selected_row_idx; - else if (m_first_visible_row + num_visible_rows <= m_selected_row_idx) - m_first_visible_row = m_selected_row_idx - num_visible_rows + 1; - - int row_idx = 0; - int num_rows_left = num_visible_rows; - m_root.Draw(window, m_first_visible_row, m_selected_row_idx, row_idx, - num_rows_left); - // Get the selected row - m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx); - } else { - m_selected_item = nullptr; - } - - return true; // Drawing handled - } - - const char *WindowDelegateGetHelpText() override { - return "Thread window keyboard shortcuts:"; - } - - KeyHelp *WindowDelegateGetKeyHelp() override { - static curses::KeyHelp g_source_view_key_help[] = { - {KEY_UP, "Select previous item"}, - {KEY_DOWN, "Select next item"}, - {KEY_RIGHT, "Expand the selected item"}, - {KEY_LEFT, - "Unexpand the selected item or select parent if not expanded"}, - {KEY_PPAGE, "Page up"}, - {KEY_NPAGE, "Page down"}, - {'h', "Show help dialog"}, - {' ', "Toggle item expansion"}, - {',', "Page up"}, - {'.', "Page down"}, - {'\0', nullptr}}; - return g_source_view_key_help; - } - - HandleCharResult WindowDelegateHandleChar(Window &window, int c) override { - switch (c) { - case ',': - case KEY_PPAGE: - // Page up key - if (m_first_visible_row > 0) { - if (m_first_visible_row > m_max_y) - m_first_visible_row -= m_max_y; - else - m_first_visible_row = 0; - m_selected_row_idx = m_first_visible_row; - m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx); - if (m_selected_item) - m_selected_item->ItemWasSelected(); - } - return eKeyHandled; - - case '.': - case KEY_NPAGE: - // Page down key - if (m_num_rows > m_max_y) { - if (m_first_visible_row + m_max_y < m_num_rows) { - m_first_visible_row += m_max_y; - m_selected_row_idx = m_first_visible_row; - m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx); - if (m_selected_item) - m_selected_item->ItemWasSelected(); - } - } - return eKeyHandled; - - case KEY_UP: - if (m_selected_row_idx > 0) { - --m_selected_row_idx; - m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx); - if (m_selected_item) - m_selected_item->ItemWasSelected(); - } - return eKeyHandled; - - case KEY_DOWN: - if (m_selected_row_idx + 1 < m_num_rows) { - ++m_selected_row_idx; - m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx); - if (m_selected_item) - m_selected_item->ItemWasSelected(); - } - return eKeyHandled; - - case KEY_RIGHT: - if (m_selected_item) { - if (!m_selected_item->IsExpanded()) - m_selected_item->Expand(); - } - return eKeyHandled; - - case KEY_LEFT: - if (m_selected_item) { - if (m_selected_item->IsExpanded()) - m_selected_item->Unexpand(); - else if (m_selected_item->GetParent()) { - m_selected_row_idx = m_selected_item->GetParent()->GetRowIndex(); - m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx); - if 
(m_selected_item) - m_selected_item->ItemWasSelected(); - } - } - return eKeyHandled; - - case ' ': - // Toggle expansion state when SPACE is pressed - if (m_selected_item) { - if (m_selected_item->IsExpanded()) - m_selected_item->Unexpand(); - else - m_selected_item->Expand(); - } - return eKeyHandled; - - case 'h': - window.CreateHelpSubwindow(); - return eKeyHandled; - - default: - break; - } - return eKeyNotHandled; - } - -protected: - Debugger &m_debugger; - TreeDelegateSP m_delegate_sp; - TreeItem m_root; - TreeItem *m_selected_item; - int m_num_rows; - int m_selected_row_idx; - int m_first_visible_row; - int m_min_x; - int m_min_y; - int m_max_x; - int m_max_y; -}; - -class FrameTreeDelegate : public TreeDelegate { -public: - FrameTreeDelegate() : TreeDelegate() { - FormatEntity::Parse( - "frame #${frame.index}: {${function.name}${function.pc-offset}}}", - m_format); - } - - ~FrameTreeDelegate() override = default; - - void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) override { - Thread *thread = (Thread *)item.GetUserData(); - if (thread) { - const uint64_t frame_idx = item.GetIdentifier(); - StackFrameSP frame_sp = thread->GetStackFrameAtIndex(frame_idx); - if (frame_sp) { - StreamString strm; - const SymbolContext &sc = - frame_sp->GetSymbolContext(eSymbolContextEverything); - ExecutionContext exe_ctx(frame_sp); - if (FormatEntity::Format(m_format, strm, &sc, &exe_ctx, nullptr, - nullptr, false, false)) { - int right_pad = 1; - window.PutCStringTruncated(strm.GetString().str().c_str(), right_pad); - } - } - } - } - - void TreeDelegateGenerateChildren(TreeItem &item) override { - // No children for frames yet... - } - - bool TreeDelegateItemSelected(TreeItem &item) override { - Thread *thread = (Thread *)item.GetUserData(); - if (thread) { - thread->GetProcess()->GetThreadList().SetSelectedThreadByID( - thread->GetID()); - const uint64_t frame_idx = item.GetIdentifier(); - thread->SetSelectedFrameByIndex(frame_idx); - return true; - } - return false; - } - -protected: - FormatEntity::Entry m_format; -}; - -class ThreadTreeDelegate : public TreeDelegate { -public: - ThreadTreeDelegate(Debugger &debugger) - : TreeDelegate(), m_debugger(debugger), m_tid(LLDB_INVALID_THREAD_ID), - m_stop_id(UINT32_MAX) { - FormatEntity::Parse("thread #${thread.index}: tid = ${thread.id}{, stop " - "reason = ${thread.stop-reason}}", - m_format); - } - - ~ThreadTreeDelegate() override = default; - - ProcessSP GetProcess() { - return m_debugger.GetCommandInterpreter() - .GetExecutionContext() - .GetProcessSP(); - } - - ThreadSP GetThread(const TreeItem &item) { - ProcessSP process_sp = GetProcess(); - if (process_sp) - return process_sp->GetThreadList().FindThreadByID(item.GetIdentifier()); - return ThreadSP(); - } - - void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) override { - ThreadSP thread_sp = GetThread(item); - if (thread_sp) { - StreamString strm; - ExecutionContext exe_ctx(thread_sp); - if (FormatEntity::Format(m_format, strm, nullptr, &exe_ctx, nullptr, - nullptr, false, false)) { - int right_pad = 1; - window.PutCStringTruncated(strm.GetString().str().c_str(), right_pad); - } - } - } - - void TreeDelegateGenerateChildren(TreeItem &item) override { - ProcessSP process_sp = GetProcess(); - if (process_sp && process_sp->IsAlive()) { - StateType state = process_sp->GetState(); - if (StateIsStoppedState(state, true)) { - ThreadSP thread_sp = GetThread(item); - if (thread_sp) { - if (m_stop_id == process_sp->GetStopID() && - thread_sp->GetID() == m_tid) - return; // 
Children are already up to date - if (!m_frame_delegate_sp) { - // Always expand the thread item the first time we show it - m_frame_delegate_sp = std::make_shared(); - } - - m_stop_id = process_sp->GetStopID(); - m_tid = thread_sp->GetID(); - - TreeItem t(&item, *m_frame_delegate_sp, false); - size_t num_frames = thread_sp->GetStackFrameCount(); - item.Resize(num_frames, t); - for (size_t i = 0; i < num_frames; ++i) { - item[i].SetUserData(thread_sp.get()); - item[i].SetIdentifier(i); - } - } - return; - } - } - item.ClearChildren(); - } - - bool TreeDelegateItemSelected(TreeItem &item) override { - ProcessSP process_sp = GetProcess(); - if (process_sp && process_sp->IsAlive()) { - StateType state = process_sp->GetState(); - if (StateIsStoppedState(state, true)) { - ThreadSP thread_sp = GetThread(item); - if (thread_sp) { - ThreadList &thread_list = thread_sp->GetProcess()->GetThreadList(); - std::lock_guard guard(thread_list.GetMutex()); - ThreadSP selected_thread_sp = thread_list.GetSelectedThread(); - if (selected_thread_sp->GetID() != thread_sp->GetID()) { - thread_list.SetSelectedThreadByID(thread_sp->GetID()); - return true; - } - } - } - } - return false; - } - -protected: - Debugger &m_debugger; - std::shared_ptr m_frame_delegate_sp; - lldb::user_id_t m_tid; - uint32_t m_stop_id; - FormatEntity::Entry m_format; -}; - -class ThreadsTreeDelegate : public TreeDelegate { -public: - ThreadsTreeDelegate(Debugger &debugger) - : TreeDelegate(), m_thread_delegate_sp(), m_debugger(debugger), - m_stop_id(UINT32_MAX) { - FormatEntity::Parse("process ${process.id}{, name = ${process.name}}", - m_format); - } - - ~ThreadsTreeDelegate() override = default; - - ProcessSP GetProcess() { - return m_debugger.GetCommandInterpreter() - .GetExecutionContext() - .GetProcessSP(); - } - - void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) override { - ProcessSP process_sp = GetProcess(); - if (process_sp && process_sp->IsAlive()) { - StreamString strm; - ExecutionContext exe_ctx(process_sp); - if (FormatEntity::Format(m_format, strm, nullptr, &exe_ctx, nullptr, - nullptr, false, false)) { - int right_pad = 1; - window.PutCStringTruncated(strm.GetString().str().c_str(), right_pad); - } - } - } - - void TreeDelegateGenerateChildren(TreeItem &item) override { - ProcessSP process_sp = GetProcess(); - if (process_sp && process_sp->IsAlive()) { - StateType state = process_sp->GetState(); - if (StateIsStoppedState(state, true)) { - const uint32_t stop_id = process_sp->GetStopID(); - if (m_stop_id == stop_id) - return; // Children are already up to date - - m_stop_id = stop_id; - - if (!m_thread_delegate_sp) { - // Always expand the thread item the first time we show it - // item.Expand(); - m_thread_delegate_sp = - std::make_shared(m_debugger); - } - - TreeItem t(&item, *m_thread_delegate_sp, false); - ThreadList &threads = process_sp->GetThreadList(); - std::lock_guard guard(threads.GetMutex()); - size_t num_threads = threads.GetSize(); - item.Resize(num_threads, t); - for (size_t i = 0; i < num_threads; ++i) { - item[i].SetIdentifier(threads.GetThreadAtIndex(i)->GetID()); - item[i].SetMightHaveChildren(true); - } - return; - } - } - item.ClearChildren(); - } - - bool TreeDelegateItemSelected(TreeItem &item) override { return false; } - -protected: - std::shared_ptr m_thread_delegate_sp; - Debugger &m_debugger; - uint32_t m_stop_id; - FormatEntity::Entry m_format; -}; - -class ValueObjectListDelegate : public WindowDelegate { -public: - ValueObjectListDelegate() - : m_rows(), m_selected_row(nullptr), 
m_selected_row_idx(0), - m_first_visible_row(0), m_num_rows(0), m_max_x(0), m_max_y(0) {} - - ValueObjectListDelegate(ValueObjectList &valobj_list) - : m_rows(), m_selected_row(nullptr), m_selected_row_idx(0), - m_first_visible_row(0), m_num_rows(0), m_max_x(0), m_max_y(0) { - SetValues(valobj_list); - } - - ~ValueObjectListDelegate() override = default; - - void SetValues(ValueObjectList &valobj_list) { - m_selected_row = nullptr; - m_selected_row_idx = 0; - m_first_visible_row = 0; - m_num_rows = 0; - m_rows.clear(); - for (auto &valobj_sp : valobj_list.GetObjects()) - m_rows.push_back(Row(valobj_sp, nullptr)); - } - - bool WindowDelegateDraw(Window &window, bool force) override { - m_num_rows = 0; - m_min_x = 2; - m_min_y = 1; - m_max_x = window.GetWidth() - 1; - m_max_y = window.GetHeight() - 1; - - window.Erase(); - window.DrawTitleBox(window.GetName()); - - const int num_visible_rows = NumVisibleRows(); - const int num_rows = CalculateTotalNumberRows(m_rows); - - // If we unexpanded while having something selected our total number of - // rows is less than the num visible rows, then make sure we show all the - // rows by setting the first visible row accordingly. - if (m_first_visible_row > 0 && num_rows < num_visible_rows) - m_first_visible_row = 0; - - // Make sure the selected row is always visible - if (m_selected_row_idx < m_first_visible_row) - m_first_visible_row = m_selected_row_idx; - else if (m_first_visible_row + num_visible_rows <= m_selected_row_idx) - m_first_visible_row = m_selected_row_idx - num_visible_rows + 1; - - DisplayRows(window, m_rows, g_options); - - // Get the selected row - m_selected_row = GetRowForRowIndex(m_selected_row_idx); - // Keep the cursor on the selected row so the highlight and the cursor are - // always on the same line - if (m_selected_row) - window.MoveCursor(m_selected_row->x, m_selected_row->y); - - return true; // Drawing handled - } - - KeyHelp *WindowDelegateGetKeyHelp() override { - static curses::KeyHelp g_source_view_key_help[] = { - {KEY_UP, "Select previous item"}, - {KEY_DOWN, "Select next item"}, - {KEY_RIGHT, "Expand selected item"}, - {KEY_LEFT, "Unexpand selected item or select parent if not expanded"}, - {KEY_PPAGE, "Page up"}, - {KEY_NPAGE, "Page down"}, - {'A', "Format as annotated address"}, - {'b', "Format as binary"}, - {'B', "Format as hex bytes with ASCII"}, - {'c', "Format as character"}, - {'d', "Format as a signed integer"}, - {'D', "Format selected value using the default format for the type"}, - {'f', "Format as float"}, - {'h', "Show help dialog"}, - {'i', "Format as instructions"}, - {'o', "Format as octal"}, - {'p', "Format as pointer"}, - {'s', "Format as C string"}, - {'t', "Toggle showing/hiding type names"}, - {'u', "Format as an unsigned integer"}, - {'x', "Format as hex"}, - {'X', "Format as uppercase hex"}, - {' ', "Toggle item expansion"}, - {',', "Page up"}, - {'.', "Page down"}, - {'\0', nullptr}}; - return g_source_view_key_help; - } - - HandleCharResult WindowDelegateHandleChar(Window &window, int c) override { - switch (c) { - case 'x': - case 'X': - case 'o': - case 's': - case 'u': - case 'd': - case 'D': - case 'i': - case 'A': - case 'p': - case 'c': - case 'b': - case 'B': - case 'f': - // Change the format for the currently selected item - if (m_selected_row) { - auto valobj_sp = m_selected_row->value.GetSP(); - if (valobj_sp) - valobj_sp->SetFormat(FormatForChar(c)); - } - return eKeyHandled; - - case 't': - // Toggle showing type names - g_options.show_types = !g_options.show_types; - 
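// Every format key accepted above funnels through FormatForChar() (defined
// further down in this class) into ValueObject::SetFormat(), so a one-letter
// keystroke changes how the selected row is displayed on the next draw:
//
//   if (m_selected_row) {
//     if (auto valobj_sp = m_selected_row->value.GetSP())
//       valobj_sp->SetFormat(FormatForChar(c)); // e.g. 'x' -> eFormatHex
//   }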
return eKeyHandled; - - case ',': - case KEY_PPAGE: - // Page up key - if (m_first_visible_row > 0) { - if (static_cast(m_first_visible_row) > m_max_y) - m_first_visible_row -= m_max_y; - else - m_first_visible_row = 0; - m_selected_row_idx = m_first_visible_row; - } - return eKeyHandled; - - case '.': - case KEY_NPAGE: - // Page down key - if (m_num_rows > static_cast(m_max_y)) { - if (m_first_visible_row + m_max_y < m_num_rows) { - m_first_visible_row += m_max_y; - m_selected_row_idx = m_first_visible_row; - } - } - return eKeyHandled; - - case KEY_UP: - if (m_selected_row_idx > 0) - --m_selected_row_idx; - return eKeyHandled; - - case KEY_DOWN: - if (m_selected_row_idx + 1 < m_num_rows) - ++m_selected_row_idx; - return eKeyHandled; - - case KEY_RIGHT: - if (m_selected_row) { - if (!m_selected_row->expanded) - m_selected_row->Expand(); - } - return eKeyHandled; - - case KEY_LEFT: - if (m_selected_row) { - if (m_selected_row->expanded) - m_selected_row->Unexpand(); - else if (m_selected_row->parent) - m_selected_row_idx = m_selected_row->parent->row_idx; - } - return eKeyHandled; - - case ' ': - // Toggle expansion state when SPACE is pressed - if (m_selected_row) { - if (m_selected_row->expanded) - m_selected_row->Unexpand(); - else - m_selected_row->Expand(); - } - return eKeyHandled; - - case 'h': - window.CreateHelpSubwindow(); - return eKeyHandled; - - default: - break; - } - return eKeyNotHandled; - } - -protected: - std::vector m_rows; - Row *m_selected_row; - uint32_t m_selected_row_idx; - uint32_t m_first_visible_row; - uint32_t m_num_rows; - int m_min_x; - int m_min_y; - int m_max_x; - int m_max_y; - - static Format FormatForChar(int c) { - switch (c) { - case 'x': - return eFormatHex; - case 'X': - return eFormatHexUppercase; - case 'o': - return eFormatOctal; - case 's': - return eFormatCString; - case 'u': - return eFormatUnsigned; - case 'd': - return eFormatDecimal; - case 'D': - return eFormatDefault; - case 'i': - return eFormatInstruction; - case 'A': - return eFormatAddressInfo; - case 'p': - return eFormatPointer; - case 'c': - return eFormatChar; - case 'b': - return eFormatBinary; - case 'B': - return eFormatBytesWithASCII; - case 'f': - return eFormatFloat; - } - return eFormatDefault; - } - - bool DisplayRowObject(Window &window, Row &row, DisplayOptions &options, - bool highlight, bool last_child) { - ValueObject *valobj = row.value.GetSP().get(); - - if (valobj == nullptr) - return false; - - const char *type_name = - options.show_types ? 
valobj->GetTypeName().GetCString() : nullptr;
-    const char *name = valobj->GetName().GetCString();
-    const char *value = valobj->GetValueAsCString();
-    const char *summary = valobj->GetSummaryAsCString();
-
-    window.MoveCursor(row.x, row.y);
-
-    row.DrawTree(window);
-
-    if (highlight)
-      window.AttributeOn(A_REVERSE);
-
-    if (type_name && type_name[0])
-      window.Printf("(%s) ", type_name);
-
-    if (name && name[0])
-      window.PutCString(name);
-
-    attr_t changed_attr = 0;
-    if (valobj->GetValueDidChange())
-      changed_attr = COLOR_PAIR(5) | A_BOLD;
-
-    if (value && value[0]) {
-      window.PutCString(" = ");
-      if (changed_attr)
-        window.AttributeOn(changed_attr);
-      window.PutCString(value);
-      if (changed_attr)
-        window.AttributeOff(changed_attr);
-    }
-
-    if (summary && summary[0]) {
-      window.PutChar(' ');
-      if (changed_attr)
-        window.AttributeOn(changed_attr);
-      window.PutCString(summary);
-      if (changed_attr)
-        window.AttributeOff(changed_attr);
-    }
-
-    if (highlight)
-      window.AttributeOff(A_REVERSE);
-
-    return true;
-  }
-
-  void DisplayRows(Window &window, std::vector<Row> &rows,
-                   DisplayOptions &options) {
-    // > 0x25B7
-    // \/ 0x25BD
-
-    bool window_is_active = window.IsActive();
-    for (auto &row : rows) {
-      const bool last_child = row.parent && &rows[rows.size() - 1] == &row;
-      // Save the row index in each Row structure
-      row.row_idx = m_num_rows;
-      if ((m_num_rows >= m_first_visible_row) &&
-          ((m_num_rows - m_first_visible_row) <
-           static_cast<size_t>(NumVisibleRows()))) {
-        row.x = m_min_x;
-        row.y = m_num_rows - m_first_visible_row + 1;
-        if (DisplayRowObject(window, row, options,
-                             window_is_active &&
-                                 m_num_rows == m_selected_row_idx,
-                             last_child)) {
-          ++m_num_rows;
-        } else {
-          row.x = 0;
-          row.y = 0;
-        }
-      } else {
-        row.x = 0;
-        row.y = 0;
-        ++m_num_rows;
-      }
-
-      auto &children = row.GetChildren();
-      if (row.expanded && !children.empty()) {
-        DisplayRows(window, children, options);
-      }
-    }
-  }
-
-  int CalculateTotalNumberRows(std::vector<Row> &rows) {
-    int row_count = 0;
-    for (auto &row : rows) {
-      ++row_count;
-      if (row.expanded)
-        row_count += CalculateTotalNumberRows(row.GetChildren());
-    }
-    return row_count;
-  }
-
-  static Row *GetRowForRowIndexImpl(std::vector<Row> &rows,
-                                    size_t &row_index) {
-    for (auto &row : rows) {
-      if (row_index == 0)
-        return &row;
-      else {
-        --row_index;
-        auto &children = row.GetChildren();
-        if (row.expanded && !children.empty()) {
-          Row *result = GetRowForRowIndexImpl(children, row_index);
-          if (result)
-            return result;
-        }
-      }
-    }
-    return nullptr;
-  }
-
-  Row *GetRowForRowIndex(size_t row_index) {
-    return GetRowForRowIndexImpl(m_rows, row_index);
-  }
-
-  int NumVisibleRows() const { return m_max_y - m_min_y; }
-
-  static DisplayOptions g_options;
-};
-
-class FrameVariablesWindowDelegate : public ValueObjectListDelegate {
-public:
-  FrameVariablesWindowDelegate(Debugger &debugger)
-      : ValueObjectListDelegate(), m_debugger(debugger),
-        m_frame_block(nullptr) {}
-
-  ~FrameVariablesWindowDelegate() override = default;
-
-  const char *WindowDelegateGetHelpText() override {
-    return "Frame variable window keyboard shortcuts:";
-  }
-
-  bool WindowDelegateDraw(Window &window, bool force) override {
-    ExecutionContext exe_ctx(
-        m_debugger.GetCommandInterpreter().GetExecutionContext());
-    Process *process = exe_ctx.GetProcessPtr();
-    Block *frame_block = nullptr;
-    StackFrame *frame = nullptr;
-
-    if (process) {
-      StateType state = process->GetState();
-      if (StateIsStoppedState(state, true)) {
-        frame = exe_ctx.GetFramePtr();
-        if (frame)
-          frame_block = frame->GetFrameBlock();
-      }
else if (StateIsRunningState(state)) { - return true; // Don't do any updating when we are running - } - } - - ValueObjectList local_values; - if (frame_block) { - // Only update the variables if they have changed - if (m_frame_block != frame_block) { - m_frame_block = frame_block; - - VariableList *locals = frame->GetVariableList(true); - if (locals) { - const DynamicValueType use_dynamic = eDynamicDontRunTarget; - for (const VariableSP &local_sp : *locals) { - ValueObjectSP value_sp = - frame->GetValueObjectForFrameVariable(local_sp, use_dynamic); - if (value_sp) { - ValueObjectSP synthetic_value_sp = value_sp->GetSyntheticValue(); - if (synthetic_value_sp) - local_values.Append(synthetic_value_sp); - else - local_values.Append(value_sp); - } - } - // Update the values - SetValues(local_values); - } - } - } else { - m_frame_block = nullptr; - // Update the values with an empty list if there is no frame - SetValues(local_values); - } - - return ValueObjectListDelegate::WindowDelegateDraw(window, force); - } - -protected: - Debugger &m_debugger; - Block *m_frame_block; -}; - -class RegistersWindowDelegate : public ValueObjectListDelegate { -public: - RegistersWindowDelegate(Debugger &debugger) - : ValueObjectListDelegate(), m_debugger(debugger) {} - - ~RegistersWindowDelegate() override = default; - - const char *WindowDelegateGetHelpText() override { - return "Register window keyboard shortcuts:"; - } - - bool WindowDelegateDraw(Window &window, bool force) override { - ExecutionContext exe_ctx( - m_debugger.GetCommandInterpreter().GetExecutionContext()); - StackFrame *frame = exe_ctx.GetFramePtr(); - - ValueObjectList value_list; - if (frame) { - if (frame->GetStackID() != m_stack_id) { - m_stack_id = frame->GetStackID(); - RegisterContextSP reg_ctx(frame->GetRegisterContext()); - if (reg_ctx) { - const uint32_t num_sets = reg_ctx->GetRegisterSetCount(); - for (uint32_t set_idx = 0; set_idx < num_sets; ++set_idx) { - value_list.Append( - ValueObjectRegisterSet::Create(frame, reg_ctx, set_idx)); - } - } - SetValues(value_list); - } - } else { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive()) - return true; // Don't do any updating if we are running - else { - // Update the values with an empty list if there is no process or the - // process isn't alive anymore - SetValues(value_list); - } - } - return ValueObjectListDelegate::WindowDelegateDraw(window, force); - } - -protected: - Debugger &m_debugger; - StackID m_stack_id; -}; - -static const char *CursesKeyToCString(int ch) { - static char g_desc[32]; - if (ch >= KEY_F0 && ch < KEY_F0 + 64) { - snprintf(g_desc, sizeof(g_desc), "F%u", ch - KEY_F0); - return g_desc; - } - switch (ch) { - case KEY_DOWN: - return "down"; - case KEY_UP: - return "up"; - case KEY_LEFT: - return "left"; - case KEY_RIGHT: - return "right"; - case KEY_HOME: - return "home"; - case KEY_BACKSPACE: - return "backspace"; - case KEY_DL: - return "delete-line"; - case KEY_IL: - return "insert-line"; - case KEY_DC: - return "delete-char"; - case KEY_IC: - return "insert-char"; - case KEY_CLEAR: - return "clear"; - case KEY_EOS: - return "clear-to-eos"; - case KEY_EOL: - return "clear-to-eol"; - case KEY_SF: - return "scroll-forward"; - case KEY_SR: - return "scroll-backward"; - case KEY_NPAGE: - return "page-down"; - case KEY_PPAGE: - return "page-up"; - case KEY_STAB: - return "set-tab"; - case KEY_CTAB: - return "clear-tab"; - case KEY_CATAB: - return "clear-all-tabs"; - case KEY_ENTER: - return "enter"; - case KEY_PRINT: - return 
"print"; - case KEY_LL: - return "lower-left key"; - case KEY_A1: - return "upper left of keypad"; - case KEY_A3: - return "upper right of keypad"; - case KEY_B2: - return "center of keypad"; - case KEY_C1: - return "lower left of keypad"; - case KEY_C3: - return "lower right of keypad"; - case KEY_BTAB: - return "back-tab key"; - case KEY_BEG: - return "begin key"; - case KEY_CANCEL: - return "cancel key"; - case KEY_CLOSE: - return "close key"; - case KEY_COMMAND: - return "command key"; - case KEY_COPY: - return "copy key"; - case KEY_CREATE: - return "create key"; - case KEY_END: - return "end key"; - case KEY_EXIT: - return "exit key"; - case KEY_FIND: - return "find key"; - case KEY_HELP: - return "help key"; - case KEY_MARK: - return "mark key"; - case KEY_MESSAGE: - return "message key"; - case KEY_MOVE: - return "move key"; - case KEY_NEXT: - return "next key"; - case KEY_OPEN: - return "open key"; - case KEY_OPTIONS: - return "options key"; - case KEY_PREVIOUS: - return "previous key"; - case KEY_REDO: - return "redo key"; - case KEY_REFERENCE: - return "reference key"; - case KEY_REFRESH: - return "refresh key"; - case KEY_REPLACE: - return "replace key"; - case KEY_RESTART: - return "restart key"; - case KEY_RESUME: - return "resume key"; - case KEY_SAVE: - return "save key"; - case KEY_SBEG: - return "shifted begin key"; - case KEY_SCANCEL: - return "shifted cancel key"; - case KEY_SCOMMAND: - return "shifted command key"; - case KEY_SCOPY: - return "shifted copy key"; - case KEY_SCREATE: - return "shifted create key"; - case KEY_SDC: - return "shifted delete-character key"; - case KEY_SDL: - return "shifted delete-line key"; - case KEY_SELECT: - return "select key"; - case KEY_SEND: - return "shifted end key"; - case KEY_SEOL: - return "shifted clear-to-end-of-line key"; - case KEY_SEXIT: - return "shifted exit key"; - case KEY_SFIND: - return "shifted find key"; - case KEY_SHELP: - return "shifted help key"; - case KEY_SHOME: - return "shifted home key"; - case KEY_SIC: - return "shifted insert-character key"; - case KEY_SLEFT: - return "shifted left-arrow key"; - case KEY_SMESSAGE: - return "shifted message key"; - case KEY_SMOVE: - return "shifted move key"; - case KEY_SNEXT: - return "shifted next key"; - case KEY_SOPTIONS: - return "shifted options key"; - case KEY_SPREVIOUS: - return "shifted previous key"; - case KEY_SPRINT: - return "shifted print key"; - case KEY_SREDO: - return "shifted redo key"; - case KEY_SREPLACE: - return "shifted replace key"; - case KEY_SRIGHT: - return "shifted right-arrow key"; - case KEY_SRSUME: - return "shifted resume key"; - case KEY_SSAVE: - return "shifted save key"; - case KEY_SSUSPEND: - return "shifted suspend key"; - case KEY_SUNDO: - return "shifted undo key"; - case KEY_SUSPEND: - return "suspend key"; - case KEY_UNDO: - return "undo key"; - case KEY_MOUSE: - return "Mouse event has occurred"; - case KEY_RESIZE: - return "Terminal resize event"; -#ifdef KEY_EVENT - case KEY_EVENT: - return "We were interrupted by an event"; -#endif - case KEY_RETURN: - return "return"; - case ' ': - return "space"; - case '\t': - return "tab"; - case KEY_ESCAPE: - return "escape"; - default: - if (isprint(ch)) - snprintf(g_desc, sizeof(g_desc), "%c", ch); - else - snprintf(g_desc, sizeof(g_desc), "\\x%2.2x", ch); - return g_desc; - } - return nullptr; -} - -HelpDialogDelegate::HelpDialogDelegate(const char *text, - KeyHelp *key_help_array) - : m_text(), m_first_visible_line(0) { - if (text && text[0]) { - m_text.SplitIntoLines(text); - 
m_text.AppendString(""); - } - if (key_help_array) { - for (KeyHelp *key = key_help_array; key->ch; ++key) { - StreamString key_description; - key_description.Printf("%10s - %s", CursesKeyToCString(key->ch), - key->description); - m_text.AppendString(key_description.GetString()); - } - } -} - -HelpDialogDelegate::~HelpDialogDelegate() = default; - -bool HelpDialogDelegate::WindowDelegateDraw(Window &window, bool force) { - window.Erase(); - const int window_height = window.GetHeight(); - int x = 2; - int y = 1; - const int min_y = y; - const int max_y = window_height - 1 - y; - const size_t num_visible_lines = max_y - min_y + 1; - const size_t num_lines = m_text.GetSize(); - const char *bottom_message; - if (num_lines <= num_visible_lines) - bottom_message = "Press any key to exit"; - else - bottom_message = "Use arrows to scroll, any other key to exit"; - window.DrawTitleBox(window.GetName(), bottom_message); - while (y <= max_y) { - window.MoveCursor(x, y); - window.PutCStringTruncated( - m_text.GetStringAtIndex(m_first_visible_line + y - min_y), 1); - ++y; - } - return true; -} - -HandleCharResult HelpDialogDelegate::WindowDelegateHandleChar(Window &window, - int key) { - bool done = false; - const size_t num_lines = m_text.GetSize(); - const size_t num_visible_lines = window.GetHeight() - 2; - - if (num_lines <= num_visible_lines) { - done = true; - // If we have all lines visible and don't need scrolling, then any key - // press will cause us to exit - } else { - switch (key) { - case KEY_UP: - if (m_first_visible_line > 0) - --m_first_visible_line; - break; - - case KEY_DOWN: - if (m_first_visible_line + num_visible_lines < num_lines) - ++m_first_visible_line; - break; - - case KEY_PPAGE: - case ',': - if (m_first_visible_line > 0) { - if (static_cast(m_first_visible_line) >= num_visible_lines) - m_first_visible_line -= num_visible_lines; - else - m_first_visible_line = 0; - } - break; - - case KEY_NPAGE: - case '.': - if (m_first_visible_line + num_visible_lines < num_lines) { - m_first_visible_line += num_visible_lines; - if (static_cast(m_first_visible_line) > num_lines) - m_first_visible_line = num_lines - num_visible_lines; - } - break; - - default: - done = true; - break; - } - } - if (done) - window.GetParent()->RemoveSubWindow(&window); - return eKeyHandled; -} - -class ApplicationDelegate : public WindowDelegate, public MenuDelegate { -public: - enum { - eMenuID_LLDB = 1, - eMenuID_LLDBAbout, - eMenuID_LLDBExit, - - eMenuID_Target, - eMenuID_TargetCreate, - eMenuID_TargetDelete, - - eMenuID_Process, - eMenuID_ProcessAttach, - eMenuID_ProcessDetach, - eMenuID_ProcessLaunch, - eMenuID_ProcessContinue, - eMenuID_ProcessHalt, - eMenuID_ProcessKill, - - eMenuID_Thread, - eMenuID_ThreadStepIn, - eMenuID_ThreadStepOver, - eMenuID_ThreadStepOut, - - eMenuID_View, - eMenuID_ViewBacktrace, - eMenuID_ViewRegisters, - eMenuID_ViewSource, - eMenuID_ViewVariables, - - eMenuID_Help, - eMenuID_HelpGUIHelp - }; - - ApplicationDelegate(Application &app, Debugger &debugger) - : WindowDelegate(), MenuDelegate(), m_app(app), m_debugger(debugger) {} - - ~ApplicationDelegate() override = default; - - bool WindowDelegateDraw(Window &window, bool force) override { - return false; // Drawing not handled, let standard window drawing happen - } - - HandleCharResult WindowDelegateHandleChar(Window &window, int key) override { - switch (key) { - case '\t': - window.SelectNextWindowAsActive(); - return eKeyHandled; - - case 'h': - window.CreateHelpSubwindow(); - return eKeyHandled; - - case KEY_ESCAPE: - 
return eQuitApplication; - - default: - break; - } - return eKeyNotHandled; - } - - const char *WindowDelegateGetHelpText() override { - return "Welcome to the LLDB curses GUI.\n\n" - "Press the TAB key to change the selected view.\n" - "Each view has its own keyboard shortcuts, press 'h' to open a " - "dialog to display them.\n\n" - "Common key bindings for all views:"; - } - - KeyHelp *WindowDelegateGetKeyHelp() override { - static curses::KeyHelp g_source_view_key_help[] = { - {'\t', "Select next view"}, - {'h', "Show help dialog with view specific key bindings"}, - {',', "Page up"}, - {'.', "Page down"}, - {KEY_UP, "Select previous"}, - {KEY_DOWN, "Select next"}, - {KEY_LEFT, "Unexpand or select parent"}, - {KEY_RIGHT, "Expand"}, - {KEY_PPAGE, "Page up"}, - {KEY_NPAGE, "Page down"}, - {'\0', nullptr}}; - return g_source_view_key_help; - } - - MenuActionResult MenuDelegateAction(Menu &menu) override { - switch (menu.GetIdentifier()) { - case eMenuID_ThreadStepIn: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasThreadScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive() && - StateIsStoppedState(process->GetState(), true)) - exe_ctx.GetThreadRef().StepIn(true); - } - } - return MenuActionResult::Handled; - - case eMenuID_ThreadStepOut: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasThreadScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive() && - StateIsStoppedState(process->GetState(), true)) - exe_ctx.GetThreadRef().StepOut(); - } - } - return MenuActionResult::Handled; - - case eMenuID_ThreadStepOver: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasThreadScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive() && - StateIsStoppedState(process->GetState(), true)) - exe_ctx.GetThreadRef().StepOver(true); - } - } - return MenuActionResult::Handled; - - case eMenuID_ProcessContinue: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive() && - StateIsStoppedState(process->GetState(), true)) - process->Resume(); - } - } - return MenuActionResult::Handled; - - case eMenuID_ProcessKill: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive()) - process->Destroy(false); - } - } - return MenuActionResult::Handled; - - case eMenuID_ProcessHalt: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive()) - process->Halt(); - } - } - return MenuActionResult::Handled; - - case eMenuID_ProcessDetach: { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) { - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive()) - process->Detach(false); - } - } - return MenuActionResult::Handled; - - case eMenuID_Process: { - // Populate the menu with all of the threads if the process is stopped - // when the Process menu gets selected and is about to display its - // submenu. 
- Menus &submenus = menu.GetSubmenus(); - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - Process *process = exe_ctx.GetProcessPtr(); - if (process && process->IsAlive() && - StateIsStoppedState(process->GetState(), true)) { - if (submenus.size() == 7) - menu.AddSubmenu(MenuSP(new Menu(Menu::Type::Separator))); - else if (submenus.size() > 8) - submenus.erase(submenus.begin() + 8, submenus.end()); - - ThreadList &threads = process->GetThreadList(); - std::lock_guard guard(threads.GetMutex()); - size_t num_threads = threads.GetSize(); - for (size_t i = 0; i < num_threads; ++i) { - ThreadSP thread_sp = threads.GetThreadAtIndex(i); - char menu_char = '\0'; - if (i < 9) - menu_char = '1' + i; - StreamString thread_menu_title; - thread_menu_title.Printf("Thread %u", thread_sp->GetIndexID()); - const char *thread_name = thread_sp->GetName(); - if (thread_name && thread_name[0]) - thread_menu_title.Printf(" %s", thread_name); - else { - const char *queue_name = thread_sp->GetQueueName(); - if (queue_name && queue_name[0]) - thread_menu_title.Printf(" %s", queue_name); - } - menu.AddSubmenu( - MenuSP(new Menu(thread_menu_title.GetString().str().c_str(), - nullptr, menu_char, thread_sp->GetID()))); - } - } else if (submenus.size() > 7) { - // Remove the separator and any other thread submenu items that were - // previously added - submenus.erase(submenus.begin() + 7, submenus.end()); - } - // Since we are adding and removing items we need to recalculate the name - // lengths - menu.RecalculateNameLengths(); - } - return MenuActionResult::Handled; - - case eMenuID_ViewVariables: { - WindowSP main_window_sp = m_app.GetMainWindow(); - WindowSP source_window_sp = main_window_sp->FindSubWindow("Source"); - WindowSP variables_window_sp = main_window_sp->FindSubWindow("Variables"); - WindowSP registers_window_sp = main_window_sp->FindSubWindow("Registers"); - const Rect source_bounds = source_window_sp->GetBounds(); - - if (variables_window_sp) { - const Rect variables_bounds = variables_window_sp->GetBounds(); - - main_window_sp->RemoveSubWindow(variables_window_sp.get()); - - if (registers_window_sp) { - // We have a registers window, so give all the area back to the - // registers window - Rect registers_bounds = variables_bounds; - registers_bounds.size.width = source_bounds.size.width; - registers_window_sp->SetBounds(registers_bounds); - } else { - // We have no registers window showing so give the bottom area back - // to the source view - source_window_sp->Resize(source_bounds.size.width, - source_bounds.size.height + - variables_bounds.size.height); - } - } else { - Rect new_variables_rect; - if (registers_window_sp) { - // We have a registers window so split the area of the registers - // window into two columns where the left hand side will be the - // variables and the right hand side will be the registers - const Rect variables_bounds = registers_window_sp->GetBounds(); - Rect new_registers_rect; - variables_bounds.VerticalSplitPercentage(0.50, new_variables_rect, - new_registers_rect); - registers_window_sp->SetBounds(new_registers_rect); - } else { - // No variables window, grab the bottom part of the source window - Rect new_source_rect; - source_bounds.HorizontalSplitPercentage(0.70, new_source_rect, - new_variables_rect); - source_window_sp->SetBounds(new_source_rect); - } - WindowSP new_window_sp = main_window_sp->CreateSubWindow( - "Variables", new_variables_rect, false); - new_window_sp->SetDelegate( - WindowDelegateSP(new 
FrameVariablesWindowDelegate(m_debugger))); - } - touchwin(stdscr); - } - return MenuActionResult::Handled; - - case eMenuID_ViewRegisters: { - WindowSP main_window_sp = m_app.GetMainWindow(); - WindowSP source_window_sp = main_window_sp->FindSubWindow("Source"); - WindowSP variables_window_sp = main_window_sp->FindSubWindow("Variables"); - WindowSP registers_window_sp = main_window_sp->FindSubWindow("Registers"); - const Rect source_bounds = source_window_sp->GetBounds(); - - if (registers_window_sp) { - if (variables_window_sp) { - const Rect variables_bounds = variables_window_sp->GetBounds(); - - // We have a variables window, so give all the area back to the - // variables window - variables_window_sp->Resize(variables_bounds.size.width + - registers_window_sp->GetWidth(), - variables_bounds.size.height); - } else { - // We have no variables window showing so give the bottom area back - // to the source view - source_window_sp->Resize(source_bounds.size.width, - source_bounds.size.height + - registers_window_sp->GetHeight()); - } - main_window_sp->RemoveSubWindow(registers_window_sp.get()); - } else { - Rect new_regs_rect; - if (variables_window_sp) { - // We have a variables window, split it into two columns where the - // left hand side will be the variables and the right hand side will - // be the registers - const Rect variables_bounds = variables_window_sp->GetBounds(); - Rect new_vars_rect; - variables_bounds.VerticalSplitPercentage(0.50, new_vars_rect, - new_regs_rect); - variables_window_sp->SetBounds(new_vars_rect); - } else { - // No registers window, grab the bottom part of the source window - Rect new_source_rect; - source_bounds.HorizontalSplitPercentage(0.70, new_source_rect, - new_regs_rect); - source_window_sp->SetBounds(new_source_rect); - } - WindowSP new_window_sp = - main_window_sp->CreateSubWindow("Registers", new_regs_rect, false); - new_window_sp->SetDelegate( - WindowDelegateSP(new RegistersWindowDelegate(m_debugger))); - } - touchwin(stdscr); - } - return MenuActionResult::Handled; - - case eMenuID_HelpGUIHelp: - m_app.GetMainWindow()->CreateHelpSubwindow(); - return MenuActionResult::Handled; - - default: - break; - } - - return MenuActionResult::NotHandled; - } - -protected: - Application &m_app; - Debugger &m_debugger; -}; - -class StatusBarWindowDelegate : public WindowDelegate { -public: - StatusBarWindowDelegate(Debugger &debugger) : m_debugger(debugger) { - FormatEntity::Parse("Thread: ${thread.id%tid}", m_format); - } - - ~StatusBarWindowDelegate() override = default; - - bool WindowDelegateDraw(Window &window, bool force) override { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - Process *process = exe_ctx.GetProcessPtr(); - Thread *thread = exe_ctx.GetThreadPtr(); - StackFrame *frame = exe_ctx.GetFramePtr(); - window.Erase(); - window.SetBackground(2); - window.MoveCursor(0, 0); - if (process) { - const StateType state = process->GetState(); - window.Printf("Process: %5" PRIu64 " %10s", process->GetID(), - StateAsCString(state)); - - if (StateIsStoppedState(state, true)) { - StreamString strm; - if (thread && FormatEntity::Format(m_format, strm, nullptr, &exe_ctx, - nullptr, nullptr, false, false)) { - window.MoveCursor(40, 0); - window.PutCStringTruncated(strm.GetString().str().c_str(), 1); - } - - window.MoveCursor(60, 0); - if (frame) - window.Printf("Frame: %3u PC = 0x%16.16" PRIx64, - frame->GetFrameIndex(), - frame->GetFrameCodeAddress().GetOpcodeLoadAddress( - exe_ctx.GetTargetPtr())); - } else if 
(state == eStateExited) { - const char *exit_desc = process->GetExitDescription(); - const int exit_status = process->GetExitStatus(); - if (exit_desc && exit_desc[0]) - window.Printf(" with status = %i (%s)", exit_status, exit_desc); - else - window.Printf(" with status = %i", exit_status); - } - } - return true; - } - -protected: - Debugger &m_debugger; - FormatEntity::Entry m_format; -}; - -class SourceFileWindowDelegate : public WindowDelegate { -public: - SourceFileWindowDelegate(Debugger &debugger) - : WindowDelegate(), m_debugger(debugger), m_sc(), m_file_sp(), - m_disassembly_scope(nullptr), m_disassembly_sp(), m_disassembly_range(), - m_title(), m_line_width(4), m_selected_line(0), m_pc_line(0), - m_stop_id(0), m_frame_idx(UINT32_MAX), m_first_visible_line(0), - m_min_x(0), m_min_y(0), m_max_x(0), m_max_y(0) {} - - ~SourceFileWindowDelegate() override = default; - - void Update(const SymbolContext &sc) { m_sc = sc; } - - uint32_t NumVisibleLines() const { return m_max_y - m_min_y; } - - const char *WindowDelegateGetHelpText() override { - return "Source/Disassembly window keyboard shortcuts:"; - } - - KeyHelp *WindowDelegateGetKeyHelp() override { - static curses::KeyHelp g_source_view_key_help[] = { - {KEY_RETURN, "Run to selected line with one shot breakpoint"}, - {KEY_UP, "Select previous source line"}, - {KEY_DOWN, "Select next source line"}, - {KEY_PPAGE, "Page up"}, - {KEY_NPAGE, "Page down"}, - {'b', "Set breakpoint on selected source/disassembly line"}, - {'c', "Continue process"}, - {'d', "Detach and resume process"}, - {'D', "Detach with process suspended"}, - {'h', "Show help dialog"}, - {'k', "Kill process"}, - {'n', "Step over (source line)"}, - {'N', "Step over (single instruction)"}, - {'o', "Step out"}, - {'s', "Step in (source line)"}, - {'S', "Step in (single instruction)"}, - {',', "Page up"}, - {'.', "Page down"}, - {'\0', nullptr}}; - return g_source_view_key_help; - } - - bool WindowDelegateDraw(Window &window, bool force) override { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - Process *process = exe_ctx.GetProcessPtr(); - Thread *thread = nullptr; - - bool update_location = false; - if (process) { - StateType state = process->GetState(); - if (StateIsStoppedState(state, true)) { - // We are stopped, so it is ok to - update_location = true; - } - } - - m_min_x = 1; - m_min_y = 2; - m_max_x = window.GetMaxX() - 1; - m_max_y = window.GetMaxY() - 1; - - const uint32_t num_visible_lines = NumVisibleLines(); - StackFrameSP frame_sp; - bool set_selected_line_to_pc = false; - - if (update_location) { - const bool process_alive = process ? process->IsAlive() : false; - bool thread_changed = false; - if (process_alive) { - thread = exe_ctx.GetThreadPtr(); - if (thread) { - frame_sp = thread->GetSelectedFrame(); - auto tid = thread->GetID(); - thread_changed = tid != m_tid; - m_tid = tid; - } else { - if (m_tid != LLDB_INVALID_THREAD_ID) { - thread_changed = true; - m_tid = LLDB_INVALID_THREAD_ID; - } - } - } - const uint32_t stop_id = process ? 
process->GetStopID() : 0; - const bool stop_id_changed = stop_id != m_stop_id; - bool frame_changed = false; - m_stop_id = stop_id; - m_title.Clear(); - if (frame_sp) { - m_sc = frame_sp->GetSymbolContext(eSymbolContextEverything); - if (m_sc.module_sp) { - m_title.Printf( - "%s", m_sc.module_sp->GetFileSpec().GetFilename().GetCString()); - ConstString func_name = m_sc.GetFunctionName(); - if (func_name) - m_title.Printf("`%s", func_name.GetCString()); - } - const uint32_t frame_idx = frame_sp->GetFrameIndex(); - frame_changed = frame_idx != m_frame_idx; - m_frame_idx = frame_idx; - } else { - m_sc.Clear(true); - frame_changed = m_frame_idx != UINT32_MAX; - m_frame_idx = UINT32_MAX; - } - - const bool context_changed = - thread_changed || frame_changed || stop_id_changed; - - if (process_alive) { - if (m_sc.line_entry.IsValid()) { - m_pc_line = m_sc.line_entry.line; - if (m_pc_line != UINT32_MAX) - --m_pc_line; // Convert to zero based line number... - // Update the selected line if the stop ID changed... - if (context_changed) - m_selected_line = m_pc_line; - - if (m_file_sp && m_file_sp->FileSpecMatches(m_sc.line_entry.file)) { - // Same file, nothing to do, we should either have the lines or not - // (source file missing) - if (m_selected_line >= static_cast(m_first_visible_line)) { - if (m_selected_line >= m_first_visible_line + num_visible_lines) - m_first_visible_line = m_selected_line - 10; - } else { - if (m_selected_line > 10) - m_first_visible_line = m_selected_line - 10; - else - m_first_visible_line = 0; - } - } else { - // File changed, set selected line to the line with the PC - m_selected_line = m_pc_line; - m_file_sp = - m_debugger.GetSourceManager().GetFile(m_sc.line_entry.file); - if (m_file_sp) { - const size_t num_lines = m_file_sp->GetNumLines(); - m_line_width = 1; - for (size_t n = num_lines; n >= 10; n = n / 10) - ++m_line_width; - - if (num_lines < num_visible_lines || - m_selected_line < num_visible_lines) - m_first_visible_line = 0; - else - m_first_visible_line = m_selected_line - 10; - } - } - } else { - m_file_sp.reset(); - } - - if (!m_file_sp || m_file_sp->GetNumLines() == 0) { - // Show disassembly - bool prefer_file_cache = false; - if (m_sc.function) { - if (m_disassembly_scope != m_sc.function) { - m_disassembly_scope = m_sc.function; - m_disassembly_sp = m_sc.function->GetInstructions( - exe_ctx, nullptr, prefer_file_cache); - if (m_disassembly_sp) { - set_selected_line_to_pc = true; - m_disassembly_range = m_sc.function->GetAddressRange(); - } else { - m_disassembly_range.Clear(); - } - } else { - set_selected_line_to_pc = context_changed; - } - } else if (m_sc.symbol) { - if (m_disassembly_scope != m_sc.symbol) { - m_disassembly_scope = m_sc.symbol; - m_disassembly_sp = m_sc.symbol->GetInstructions( - exe_ctx, nullptr, prefer_file_cache); - if (m_disassembly_sp) { - set_selected_line_to_pc = true; - m_disassembly_range.GetBaseAddress() = - m_sc.symbol->GetAddress(); - m_disassembly_range.SetByteSize(m_sc.symbol->GetByteSize()); - } else { - m_disassembly_range.Clear(); - } - } else { - set_selected_line_to_pc = context_changed; - } - } - } - } else { - m_pc_line = UINT32_MAX; - } - } - - const int window_width = window.GetWidth(); - window.Erase(); - window.DrawTitleBox("Sources"); - if (!m_title.GetString().empty()) { - window.AttributeOn(A_REVERSE); - window.MoveCursor(1, 1); - window.PutChar(' '); - window.PutCStringTruncated(m_title.GetString().str().c_str(), 1); - int x = window.GetCursorX(); - if (x < window_width - 1) { - window.Printf("%*s", 
window_width - x - 1, ""); - } - window.AttributeOff(A_REVERSE); - } - - Target *target = exe_ctx.GetTargetPtr(); - const size_t num_source_lines = GetNumSourceLines(); - if (num_source_lines > 0) { - // Display source - BreakpointLines bp_lines; - if (target) { - BreakpointList &bp_list = target->GetBreakpointList(); - const size_t num_bps = bp_list.GetSize(); - for (size_t bp_idx = 0; bp_idx < num_bps; ++bp_idx) { - BreakpointSP bp_sp = bp_list.GetBreakpointAtIndex(bp_idx); - const size_t num_bps_locs = bp_sp->GetNumLocations(); - for (size_t bp_loc_idx = 0; bp_loc_idx < num_bps_locs; ++bp_loc_idx) { - BreakpointLocationSP bp_loc_sp = - bp_sp->GetLocationAtIndex(bp_loc_idx); - LineEntry bp_loc_line_entry; - if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry( - bp_loc_line_entry)) { - if (m_file_sp->GetFileSpec() == bp_loc_line_entry.file) { - bp_lines.insert(bp_loc_line_entry.line); - } - } - } - } - } - - const attr_t selected_highlight_attr = A_REVERSE; - const attr_t pc_highlight_attr = COLOR_PAIR(1); - - for (size_t i = 0; i < num_visible_lines; ++i) { - const uint32_t curr_line = m_first_visible_line + i; - if (curr_line < num_source_lines) { - const int line_y = m_min_y + i; - window.MoveCursor(1, line_y); - const bool is_pc_line = curr_line == m_pc_line; - const bool line_is_selected = m_selected_line == curr_line; - // Highlight the line as the PC line first, then if the selected line - // isn't the same as the PC line, highlight it differently - attr_t highlight_attr = 0; - attr_t bp_attr = 0; - if (is_pc_line) - highlight_attr = pc_highlight_attr; - else if (line_is_selected) - highlight_attr = selected_highlight_attr; - - if (bp_lines.find(curr_line + 1) != bp_lines.end()) - bp_attr = COLOR_PAIR(2); - - if (bp_attr) - window.AttributeOn(bp_attr); - - window.Printf(" %*u ", m_line_width, curr_line + 1); - - if (bp_attr) - window.AttributeOff(bp_attr); - - window.PutChar(ACS_VLINE); - // Mark the line with the PC with a diamond - if (is_pc_line) - window.PutChar(ACS_DIAMOND); - else - window.PutChar(' '); - - if (highlight_attr) - window.AttributeOn(highlight_attr); - const uint32_t line_len = - m_file_sp->GetLineLength(curr_line + 1, false); - if (line_len > 0) - window.PutCString(m_file_sp->PeekLineData(curr_line + 1), line_len); - - if (is_pc_line && frame_sp && - frame_sp->GetConcreteFrameIndex() == 0) { - StopInfoSP stop_info_sp; - if (thread) - stop_info_sp = thread->GetStopInfo(); - if (stop_info_sp) { - const char *stop_description = stop_info_sp->GetDescription(); - if (stop_description && stop_description[0]) { - size_t stop_description_len = strlen(stop_description); - int desc_x = window_width - stop_description_len - 16; - window.Printf("%*s", desc_x - window.GetCursorX(), ""); - // window.MoveCursor(window_width - stop_description_len - 15, - // line_y); - window.Printf("<<< Thread %u: %s ", thread->GetIndexID(), - stop_description); - } - } else { - window.Printf("%*s", window_width - window.GetCursorX() - 1, ""); - } - } - if (highlight_attr) - window.AttributeOff(highlight_attr); - } else { - break; - } - } - } else { - size_t num_disassembly_lines = GetNumDisassemblyLines(); - if (num_disassembly_lines > 0) { - // Display disassembly - BreakpointAddrs bp_file_addrs; - Target *target = exe_ctx.GetTargetPtr(); - if (target) { - BreakpointList &bp_list = target->GetBreakpointList(); - const size_t num_bps = bp_list.GetSize(); - for (size_t bp_idx = 0; bp_idx < num_bps; ++bp_idx) { - BreakpointSP bp_sp = bp_list.GetBreakpointAtIndex(bp_idx); - const size_t 
num_bps_locs = bp_sp->GetNumLocations(); - for (size_t bp_loc_idx = 0; bp_loc_idx < num_bps_locs; - ++bp_loc_idx) { - BreakpointLocationSP bp_loc_sp = - bp_sp->GetLocationAtIndex(bp_loc_idx); - LineEntry bp_loc_line_entry; - const lldb::addr_t file_addr = - bp_loc_sp->GetAddress().GetFileAddress(); - if (file_addr != LLDB_INVALID_ADDRESS) { - if (m_disassembly_range.ContainsFileAddress(file_addr)) - bp_file_addrs.insert(file_addr); - } - } - } - } - - const attr_t selected_highlight_attr = A_REVERSE; - const attr_t pc_highlight_attr = COLOR_PAIR(1); - - StreamString strm; - - InstructionList &insts = m_disassembly_sp->GetInstructionList(); - Address pc_address; - - if (frame_sp) - pc_address = frame_sp->GetFrameCodeAddress(); - const uint32_t pc_idx = - pc_address.IsValid() - ? insts.GetIndexOfInstructionAtAddress(pc_address) - : UINT32_MAX; - if (set_selected_line_to_pc) { - m_selected_line = pc_idx; - } - - const uint32_t non_visible_pc_offset = (num_visible_lines / 5); - if (static_cast(m_first_visible_line) >= num_disassembly_lines) - m_first_visible_line = 0; - - if (pc_idx < num_disassembly_lines) { - if (pc_idx < static_cast(m_first_visible_line) || - pc_idx >= m_first_visible_line + num_visible_lines) - m_first_visible_line = pc_idx - non_visible_pc_offset; - } - - for (size_t i = 0; i < num_visible_lines; ++i) { - const uint32_t inst_idx = m_first_visible_line + i; - Instruction *inst = insts.GetInstructionAtIndex(inst_idx).get(); - if (!inst) - break; - - const int line_y = m_min_y + i; - window.MoveCursor(1, line_y); - const bool is_pc_line = frame_sp && inst_idx == pc_idx; - const bool line_is_selected = m_selected_line == inst_idx; - // Highlight the line as the PC line first, then if the selected line - // isn't the same as the PC line, highlight it differently - attr_t highlight_attr = 0; - attr_t bp_attr = 0; - if (is_pc_line) - highlight_attr = pc_highlight_attr; - else if (line_is_selected) - highlight_attr = selected_highlight_attr; - - if (bp_file_addrs.find(inst->GetAddress().GetFileAddress()) != - bp_file_addrs.end()) - bp_attr = COLOR_PAIR(2); - - if (bp_attr) - window.AttributeOn(bp_attr); - - window.Printf(" 0x%16.16llx ", - static_cast( - inst->GetAddress().GetLoadAddress(target))); - - if (bp_attr) - window.AttributeOff(bp_attr); - - window.PutChar(ACS_VLINE); - // Mark the line with the PC with a diamond - if (is_pc_line) - window.PutChar(ACS_DIAMOND); - else - window.PutChar(' '); - - if (highlight_attr) - window.AttributeOn(highlight_attr); - - const char *mnemonic = inst->GetMnemonic(&exe_ctx); - const char *operands = inst->GetOperands(&exe_ctx); - const char *comment = inst->GetComment(&exe_ctx); - - if (mnemonic != nullptr && mnemonic[0] == '\0') - mnemonic = nullptr; - if (operands != nullptr && operands[0] == '\0') - operands = nullptr; - if (comment != nullptr && comment[0] == '\0') - comment = nullptr; - - strm.Clear(); - - if (mnemonic != nullptr && operands != nullptr && comment != nullptr) - strm.Printf("%-8s %-25s ; %s", mnemonic, operands, comment); - else if (mnemonic != nullptr && operands != nullptr) - strm.Printf("%-8s %s", mnemonic, operands); - else if (mnemonic != nullptr) - strm.Printf("%s", mnemonic); - - int right_pad = 1; - window.PutCStringTruncated(strm.GetData(), right_pad); - - if (is_pc_line && frame_sp && - frame_sp->GetConcreteFrameIndex() == 0) { - StopInfoSP stop_info_sp; - if (thread) - stop_info_sp = thread->GetStopInfo(); - if (stop_info_sp) { - const char *stop_description = stop_info_sp->GetDescription(); - if 
(stop_description && stop_description[0]) { - size_t stop_description_len = strlen(stop_description); - int desc_x = window_width - stop_description_len - 16; - window.Printf("%*s", desc_x - window.GetCursorX(), ""); - // window.MoveCursor(window_width - stop_description_len - 15, - // line_y); - window.Printf("<<< Thread %u: %s ", thread->GetIndexID(), - stop_description); - } - } else { - window.Printf("%*s", window_width - window.GetCursorX() - 1, ""); - } - } - if (highlight_attr) - window.AttributeOff(highlight_attr); - } - } - } - return true; // Drawing handled - } - - size_t GetNumLines() { - size_t num_lines = GetNumSourceLines(); - if (num_lines == 0) - num_lines = GetNumDisassemblyLines(); - return num_lines; - } - - size_t GetNumSourceLines() const { - if (m_file_sp) - return m_file_sp->GetNumLines(); - return 0; - } - - size_t GetNumDisassemblyLines() const { - if (m_disassembly_sp) - return m_disassembly_sp->GetInstructionList().GetSize(); - return 0; - } - - HandleCharResult WindowDelegateHandleChar(Window &window, int c) override { - const uint32_t num_visible_lines = NumVisibleLines(); - const size_t num_lines = GetNumLines(); - - switch (c) { - case ',': - case KEY_PPAGE: - // Page up key - if (static_cast(m_first_visible_line) > num_visible_lines) - m_first_visible_line -= num_visible_lines; - else - m_first_visible_line = 0; - m_selected_line = m_first_visible_line; - return eKeyHandled; - - case '.': - case KEY_NPAGE: - // Page down key - { - if (m_first_visible_line + num_visible_lines < num_lines) - m_first_visible_line += num_visible_lines; - else if (num_lines < num_visible_lines) - m_first_visible_line = 0; - else - m_first_visible_line = num_lines - num_visible_lines; - m_selected_line = m_first_visible_line; - } - return eKeyHandled; - - case KEY_UP: - if (m_selected_line > 0) { - m_selected_line--; - if (static_cast(m_first_visible_line) > m_selected_line) - m_first_visible_line = m_selected_line; - } - return eKeyHandled; - - case KEY_DOWN: - if (m_selected_line + 1 < num_lines) { - m_selected_line++; - if (m_first_visible_line + num_visible_lines < m_selected_line) - m_first_visible_line++; - } - return eKeyHandled; - - case '\r': - case '\n': - case KEY_ENTER: - // Set a breakpoint and run to the line using a one shot breakpoint - if (GetNumSourceLines() > 0) { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope() && exe_ctx.GetProcessRef().IsAlive()) { - BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( - nullptr, // Don't limit the breakpoint to certain modules - m_file_sp->GetFileSpec(), // Source file - m_selected_line + - 1, // Source line number (m_selected_line is zero based) - 0, // Unspecified column. 
- 0, // No offset - eLazyBoolCalculate, // Check inlines using global setting - eLazyBoolCalculate, // Skip prologue using global setting, - false, // internal - false, // request_hardware - eLazyBoolCalculate); // move_to_nearest_code - // Make breakpoint one shot - bp_sp->GetOptions()->SetOneShot(true); - exe_ctx.GetProcessRef().Resume(); - } - } else if (m_selected_line < GetNumDisassemblyLines()) { - const Instruction *inst = m_disassembly_sp->GetInstructionList() - .GetInstructionAtIndex(m_selected_line) - .get(); - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasTargetScope()) { - Address addr = inst->GetAddress(); - BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( - addr, // lldb_private::Address - false, // internal - false); // request_hardware - // Make breakpoint one shot - bp_sp->GetOptions()->SetOneShot(true); - exe_ctx.GetProcessRef().Resume(); - } - } - return eKeyHandled; - - case 'b': // 'b' == toggle breakpoint on currently selected line - if (m_selected_line < GetNumSourceLines()) { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasTargetScope()) { - BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( - nullptr, // Don't limit the breakpoint to certain modules - m_file_sp->GetFileSpec(), // Source file - m_selected_line + - 1, // Source line number (m_selected_line is zero based) - 0, // No column specified. - 0, // No offset - eLazyBoolCalculate, // Check inlines using global setting - eLazyBoolCalculate, // Skip prologue using global setting, - false, // internal - false, // request_hardware - eLazyBoolCalculate); // move_to_nearest_code - } - } else if (m_selected_line < GetNumDisassemblyLines()) { - const Instruction *inst = m_disassembly_sp->GetInstructionList() - .GetInstructionAtIndex(m_selected_line) - .get(); - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasTargetScope()) { - Address addr = inst->GetAddress(); - BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( - addr, // lldb_private::Address - false, // internal - false); // request_hardware - } - } - return eKeyHandled; - - case 'd': // 'd' == detach and let run - case 'D': // 'D' == detach and keep stopped - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) - exe_ctx.GetProcessRef().Detach(c == 'D'); - } - return eKeyHandled; - - case 'k': - // 'k' == kill - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) - exe_ctx.GetProcessRef().Destroy(false); - } - return eKeyHandled; - - case 'c': - // 'c' == continue - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasProcessScope()) - exe_ctx.GetProcessRef().Resume(); - } - return eKeyHandled; - - case 'o': - // 'o' == step out - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasThreadScope() && - StateIsStoppedState(exe_ctx.GetProcessRef().GetState(), true)) { - exe_ctx.GetThreadRef().StepOut(); - } - } - return eKeyHandled; - - case 'n': // 'n' == step over - case 'N': // 'N' == step over instruction - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasThreadScope() && - StateIsStoppedState(exe_ctx.GetProcessRef().GetState(), true)) { - bool 
source_step = (c == 'n'); - exe_ctx.GetThreadRef().StepOver(source_step); - } - } - return eKeyHandled; - - case 's': // 's' == step into - case 'S': // 'S' == step into instruction - { - ExecutionContext exe_ctx = - m_debugger.GetCommandInterpreter().GetExecutionContext(); - if (exe_ctx.HasThreadScope() && - StateIsStoppedState(exe_ctx.GetProcessRef().GetState(), true)) { - bool source_step = (c == 's'); - exe_ctx.GetThreadRef().StepIn(source_step); - } - } - return eKeyHandled; - - case 'h': - window.CreateHelpSubwindow(); - return eKeyHandled; - - default: - break; - } - return eKeyNotHandled; - } - -protected: - typedef std::set BreakpointLines; - typedef std::set BreakpointAddrs; - - Debugger &m_debugger; - SymbolContext m_sc; - SourceManager::FileSP m_file_sp; - SymbolContextScope *m_disassembly_scope; - lldb::DisassemblerSP m_disassembly_sp; - AddressRange m_disassembly_range; - StreamString m_title; - lldb::user_id_t m_tid; - int m_line_width; - uint32_t m_selected_line; // The selected line - uint32_t m_pc_line; // The line with the PC - uint32_t m_stop_id; - uint32_t m_frame_idx; - int m_first_visible_line; - int m_min_x; - int m_min_y; - int m_max_x; - int m_max_y; -}; - -DisplayOptions ValueObjectListDelegate::g_options = {true}; - -IOHandlerCursesGUI::IOHandlerCursesGUI(Debugger &debugger) - : IOHandler(debugger, IOHandler::Type::Curses) {} - -void IOHandlerCursesGUI::Activate() { - IOHandler::Activate(); - if (!m_app_ap) { - m_app_ap.reset(new Application(GetInputFILE(), GetOutputFILE())); - - // This is both a window and a menu delegate - std::shared_ptr app_delegate_sp( - new ApplicationDelegate(*m_app_ap, m_debugger)); - - MenuDelegateSP app_menu_delegate_sp = - std::static_pointer_cast(app_delegate_sp); - MenuSP lldb_menu_sp( - new Menu("LLDB", "F1", KEY_F(1), ApplicationDelegate::eMenuID_LLDB)); - MenuSP exit_menuitem_sp( - new Menu("Exit", nullptr, 'x', ApplicationDelegate::eMenuID_LLDBExit)); - exit_menuitem_sp->SetCannedResult(MenuActionResult::Quit); - lldb_menu_sp->AddSubmenu(MenuSP(new Menu( - "About LLDB", nullptr, 'a', ApplicationDelegate::eMenuID_LLDBAbout))); - lldb_menu_sp->AddSubmenu(MenuSP(new Menu(Menu::Type::Separator))); - lldb_menu_sp->AddSubmenu(exit_menuitem_sp); - - MenuSP target_menu_sp(new Menu("Target", "F2", KEY_F(2), - ApplicationDelegate::eMenuID_Target)); - target_menu_sp->AddSubmenu(MenuSP(new Menu( - "Create", nullptr, 'c', ApplicationDelegate::eMenuID_TargetCreate))); - target_menu_sp->AddSubmenu(MenuSP(new Menu( - "Delete", nullptr, 'd', ApplicationDelegate::eMenuID_TargetDelete))); - - MenuSP process_menu_sp(new Menu("Process", "F3", KEY_F(3), - ApplicationDelegate::eMenuID_Process)); - process_menu_sp->AddSubmenu(MenuSP(new Menu( - "Attach", nullptr, 'a', ApplicationDelegate::eMenuID_ProcessAttach))); - process_menu_sp->AddSubmenu(MenuSP(new Menu( - "Detach", nullptr, 'd', ApplicationDelegate::eMenuID_ProcessDetach))); - process_menu_sp->AddSubmenu(MenuSP(new Menu( - "Launch", nullptr, 'l', ApplicationDelegate::eMenuID_ProcessLaunch))); - process_menu_sp->AddSubmenu(MenuSP(new Menu(Menu::Type::Separator))); - process_menu_sp->AddSubmenu( - MenuSP(new Menu("Continue", nullptr, 'c', - ApplicationDelegate::eMenuID_ProcessContinue))); - process_menu_sp->AddSubmenu(MenuSP(new Menu( - "Halt", nullptr, 'h', ApplicationDelegate::eMenuID_ProcessHalt))); - process_menu_sp->AddSubmenu(MenuSP(new Menu( - "Kill", nullptr, 'k', ApplicationDelegate::eMenuID_ProcessKill))); - - MenuSP thread_menu_sp(new Menu("Thread", "F4", KEY_F(4), - 
ApplicationDelegate::eMenuID_Thread)); - thread_menu_sp->AddSubmenu(MenuSP(new Menu( - "Step In", nullptr, 'i', ApplicationDelegate::eMenuID_ThreadStepIn))); - thread_menu_sp->AddSubmenu( - MenuSP(new Menu("Step Over", nullptr, 'v', - ApplicationDelegate::eMenuID_ThreadStepOver))); - thread_menu_sp->AddSubmenu(MenuSP(new Menu( - "Step Out", nullptr, 'o', ApplicationDelegate::eMenuID_ThreadStepOut))); - - MenuSP view_menu_sp( - new Menu("View", "F5", KEY_F(5), ApplicationDelegate::eMenuID_View)); - view_menu_sp->AddSubmenu( - MenuSP(new Menu("Backtrace", nullptr, 'b', - ApplicationDelegate::eMenuID_ViewBacktrace))); - view_menu_sp->AddSubmenu( - MenuSP(new Menu("Registers", nullptr, 'r', - ApplicationDelegate::eMenuID_ViewRegisters))); - view_menu_sp->AddSubmenu(MenuSP(new Menu( - "Source", nullptr, 's', ApplicationDelegate::eMenuID_ViewSource))); - view_menu_sp->AddSubmenu( - MenuSP(new Menu("Variables", nullptr, 'v', - ApplicationDelegate::eMenuID_ViewVariables))); - - MenuSP help_menu_sp( - new Menu("Help", "F6", KEY_F(6), ApplicationDelegate::eMenuID_Help)); - help_menu_sp->AddSubmenu(MenuSP(new Menu( - "GUI Help", nullptr, 'g', ApplicationDelegate::eMenuID_HelpGUIHelp))); - - m_app_ap->Initialize(); - WindowSP &main_window_sp = m_app_ap->GetMainWindow(); - - MenuSP menubar_sp(new Menu(Menu::Type::Bar)); - menubar_sp->AddSubmenu(lldb_menu_sp); - menubar_sp->AddSubmenu(target_menu_sp); - menubar_sp->AddSubmenu(process_menu_sp); - menubar_sp->AddSubmenu(thread_menu_sp); - menubar_sp->AddSubmenu(view_menu_sp); - menubar_sp->AddSubmenu(help_menu_sp); - menubar_sp->SetDelegate(app_menu_delegate_sp); - - Rect content_bounds = main_window_sp->GetFrame(); - Rect menubar_bounds = content_bounds.MakeMenuBar(); - Rect status_bounds = content_bounds.MakeStatusBar(); - Rect source_bounds; - Rect variables_bounds; - Rect threads_bounds; - Rect source_variables_bounds; - content_bounds.VerticalSplitPercentage(0.80, source_variables_bounds, - threads_bounds); - source_variables_bounds.HorizontalSplitPercentage(0.70, source_bounds, - variables_bounds); - - WindowSP menubar_window_sp = - main_window_sp->CreateSubWindow("Menubar", menubar_bounds, false); - // Let the menubar get keys if the active window doesn't handle the keys - // that are typed so it can respond to menubar key presses. 
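// Editor's note: a minimal sketch (helper name hypothetical) of the focus
// contract the comment above relies on. A subwindow created with
// SetCanBeActive(false) is skipped by the TAB rotation in
// SelectNextWindowAsActive(), but Window::HandleChar() still offers it any key
// the active view left unhandled -- which is how F1-F6 reach the menubar from
// any view.
WindowSP MakeMenubar(const WindowSP &main_window_sp, const Rect &menubar_bounds,
                     const MenuSP &menubar_sp) {
  WindowSP menubar =
      main_window_sp->CreateSubWindow("Menubar", menubar_bounds, false);
  menubar->SetCanBeActive(false); // keep it out of the TAB focus cycle
  menubar->SetDelegate(menubar_sp); // Menu is itself a WindowDelegate
  return menubar;
}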
- menubar_window_sp->SetCanBeActive( - false); // Don't let the menubar become the active window - menubar_window_sp->SetDelegate(menubar_sp); - - WindowSP source_window_sp( - main_window_sp->CreateSubWindow("Source", source_bounds, true)); - WindowSP variables_window_sp( - main_window_sp->CreateSubWindow("Variables", variables_bounds, false)); - WindowSP threads_window_sp( - main_window_sp->CreateSubWindow("Threads", threads_bounds, false)); - WindowSP status_window_sp( - main_window_sp->CreateSubWindow("Status", status_bounds, false)); - status_window_sp->SetCanBeActive( - false); // Don't let the status bar become the active window - main_window_sp->SetDelegate( - std::static_pointer_cast(app_delegate_sp)); - source_window_sp->SetDelegate( - WindowDelegateSP(new SourceFileWindowDelegate(m_debugger))); - variables_window_sp->SetDelegate( - WindowDelegateSP(new FrameVariablesWindowDelegate(m_debugger))); - TreeDelegateSP thread_delegate_sp(new ThreadsTreeDelegate(m_debugger)); - threads_window_sp->SetDelegate(WindowDelegateSP( - new TreeWindowDelegate(m_debugger, thread_delegate_sp))); - status_window_sp->SetDelegate( - WindowDelegateSP(new StatusBarWindowDelegate(m_debugger))); - - // Show the main help window once the first time the curses GUI is launched - static bool g_showed_help = false; - if (!g_showed_help) { - g_showed_help = true; - main_window_sp->CreateHelpSubwindow(); - } - - init_pair(1, COLOR_WHITE, COLOR_BLUE); - init_pair(2, COLOR_BLACK, COLOR_WHITE); - init_pair(3, COLOR_MAGENTA, COLOR_WHITE); - init_pair(4, COLOR_MAGENTA, COLOR_BLACK); - init_pair(5, COLOR_RED, COLOR_BLACK); - } -} - -void IOHandlerCursesGUI::Deactivate() { m_app_ap->Terminate(); } - -void IOHandlerCursesGUI::Run() { - m_app_ap->Run(m_debugger); - SetIsDone(true); -} - -IOHandlerCursesGUI::~IOHandlerCursesGUI() = default; - -void IOHandlerCursesGUI::Cancel() {} - -bool IOHandlerCursesGUI::Interrupt() { return false; } - -void IOHandlerCursesGUI::GotEOF() {} - -#endif // LLDB_DISABLE_CURSES diff --git a/lldb/source/Core/IOHandlerCursesGUI.cpp b/lldb/source/Core/IOHandlerCursesGUI.cpp new file mode 100644 index 0000000000000..a9114aa71b069 --- /dev/null +++ b/lldb/source/Core/IOHandlerCursesGUI.cpp @@ -0,0 +1,4066 @@ +//===-- IOHandlerCursesGUI.cpp ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/Core/IOHandlerCursesGUI.h"
+
+#ifndef LLDB_DISABLE_CURSES
+#include <curses.h>
+#include <panel.h>
+#endif
+
+#if defined(__APPLE__)
+#include <deque>
+#endif
+#include <string>
+
+#include "lldb/Core/Debugger.h"
+#include "lldb/Core/StreamFile.h"
+#include "lldb/Host/File.h"
+#include "lldb/Utility/Predicate.h"
+#include "lldb/Utility/Status.h"
+#include "lldb/Utility/StreamString.h"
+#include "lldb/Utility/StringList.h"
+#include "lldb/lldb-forward.h"
+
+#include "lldb/Interpreter/CommandCompletions.h"
+#include "lldb/Interpreter/CommandInterpreter.h"
+
+#ifndef LLDB_DISABLE_CURSES
+#include "lldb/Breakpoint/BreakpointLocation.h"
+#include "lldb/Core/Module.h"
+#include "lldb/Core/ValueObject.h"
+#include "lldb/Core/ValueObjectRegister.h"
+#include "lldb/Symbol/Block.h"
+#include "lldb/Symbol/Function.h"
+#include "lldb/Symbol/Symbol.h"
+#include "lldb/Symbol/VariableList.h"
+#include "lldb/Target/Process.h"
+#include "lldb/Target/RegisterContext.h"
+#include "lldb/Target/StackFrame.h"
+#include "lldb/Target/StopInfo.h"
+#include "lldb/Target/Target.h"
+#include "lldb/Target/Thread.h"
+#include "lldb/Utility/State.h"
+#endif
+
+#include "llvm/ADT/StringRef.h"
+
+#ifdef _WIN32
+#include "lldb/Host/windows/windows.h"
+#endif
+
+#include <memory>
+#include <mutex>
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <locale.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace lldb;
+using namespace lldb_private;
+using llvm::None;
+using llvm::Optional;
+using llvm::StringRef;
+
+// Curses may be disabled for some builds, for instance on Windows.
+#ifndef LLDB_DISABLE_CURSES
+
+#define KEY_RETURN 10
+#define KEY_ESCAPE 27
+
+namespace curses {
+class Menu;
+class MenuDelegate;
+class Window;
+class WindowDelegate;
+typedef std::shared_ptr<Menu> MenuSP;
+typedef std::shared_ptr<MenuDelegate> MenuDelegateSP;
+typedef std::shared_ptr<Window> WindowSP;
+typedef std::shared_ptr<WindowDelegate> WindowDelegateSP;
+typedef std::vector<MenuSP> Menus;
+typedef std::vector<WindowSP> Windows;
+typedef std::vector<WindowDelegateSP> WindowDelegates;
+
+#if 0
+type summary add -s "x=${var.x}, y=${var.y}" curses::Point
+type summary add -s "w=${var.width}, h=${var.height}" curses::Size
+type summary add -s "${var.origin%S} ${var.size%S}" curses::Rect
+#endif
+
+struct Point {
+  int x;
+  int y;
+
+  Point(int _x = 0, int _y = 0) : x(_x), y(_y) {}
+
+  void Clear() {
+    x = 0;
+    y = 0;
+  }
+
+  Point &operator+=(const Point &rhs) {
+    x += rhs.x;
+    y += rhs.y;
+    return *this;
+  }
+
+  void Dump() { printf("(x=%i, y=%i)\n", x, y); }
+};
+
+bool operator==(const Point &lhs, const Point &rhs) {
+  return lhs.x == rhs.x && lhs.y == rhs.y;
+}
+
+bool operator!=(const Point &lhs, const Point &rhs) {
+  return lhs.x != rhs.x || lhs.y != rhs.y;
+}
+
+struct Size {
+  int width;
+  int height;
+  Size(int w = 0, int h = 0) : width(w), height(h) {}
+
+  void Clear() {
+    width = 0;
+    height = 0;
+  }
+
+  void Dump() { printf("(w=%i, h=%i)\n", width, height); }
+};
+
+bool operator==(const Size &lhs, const Size &rhs) {
+  return lhs.width == rhs.width && lhs.height == rhs.height;
+}
+
+bool operator!=(const Size &lhs, const Size &rhs) {
+  return lhs.width != rhs.width || lhs.height != rhs.height;
+}
+
+struct Rect {
+  Point origin;
+  Size size;
+
+  Rect() : origin(), size() {}
+
+  Rect(const Point &p, const Size &s) : origin(p), size(s) {}
+
+  void Clear() {
+    origin.Clear();
+    size.Clear();
+  }
+
+  void Dump() {
+    printf("(x=%i, y=%i), w=%i, h=%i)\n", origin.x, origin.y, size.width,
+           size.height);
+  }
+
+ 
void Inset(int w, int h) { + if (size.width > w * 2) + size.width -= w * 2; + origin.x += w; + + if (size.height > h * 2) + size.height -= h * 2; + origin.y += h; + } + + // Return a status bar rectangle which is the last line of this rectangle. + // This rectangle will be modified to not include the status bar area. + Rect MakeStatusBar() { + Rect status_bar; + if (size.height > 1) { + status_bar.origin.x = origin.x; + status_bar.origin.y = size.height; + status_bar.size.width = size.width; + status_bar.size.height = 1; + --size.height; + } + return status_bar; + } + + // Return a menubar rectangle which is the first line of this rectangle. This + // rectangle will be modified to not include the menubar area. + Rect MakeMenuBar() { + Rect menubar; + if (size.height > 1) { + menubar.origin.x = origin.x; + menubar.origin.y = origin.y; + menubar.size.width = size.width; + menubar.size.height = 1; + ++origin.y; + --size.height; + } + return menubar; + } + + void HorizontalSplitPercentage(float top_percentage, Rect &top, + Rect &bottom) const { + float top_height = top_percentage * size.height; + HorizontalSplit(top_height, top, bottom); + } + + void HorizontalSplit(int top_height, Rect &top, Rect &bottom) const { + top = *this; + if (top_height < size.height) { + top.size.height = top_height; + bottom.origin.x = origin.x; + bottom.origin.y = origin.y + top.size.height; + bottom.size.width = size.width; + bottom.size.height = size.height - top.size.height; + } else { + bottom.Clear(); + } + } + + void VerticalSplitPercentage(float left_percentage, Rect &left, + Rect &right) const { + float left_width = left_percentage * size.width; + VerticalSplit(left_width, left, right); + } + + void VerticalSplit(int left_width, Rect &left, Rect &right) const { + left = *this; + if (left_width < size.width) { + left.size.width = left_width; + right.origin.x = origin.x + left.size.width; + right.origin.y = origin.y; + right.size.width = size.width - left.size.width; + right.size.height = size.height; + } else { + right.Clear(); + } + } +}; + +bool operator==(const Rect &lhs, const Rect &rhs) { + return lhs.origin == rhs.origin && lhs.size == rhs.size; +} + +bool operator!=(const Rect &lhs, const Rect &rhs) { + return lhs.origin != rhs.origin || lhs.size != rhs.size; +} + +enum HandleCharResult { + eKeyNotHandled = 0, + eKeyHandled = 1, + eQuitApplication = 2 +}; + +enum class MenuActionResult { + Handled, + NotHandled, + Quit // Exit all menus and quit +}; + +struct KeyHelp { + int ch; + const char *description; +}; + +class WindowDelegate { +public: + virtual ~WindowDelegate() = default; + + virtual bool WindowDelegateDraw(Window &window, bool force) { + return false; // Drawing not handled + } + + virtual HandleCharResult WindowDelegateHandleChar(Window &window, int key) { + return eKeyNotHandled; + } + + virtual const char *WindowDelegateGetHelpText() { return nullptr; } + + virtual KeyHelp *WindowDelegateGetKeyHelp() { return nullptr; } +}; + +class HelpDialogDelegate : public WindowDelegate { +public: + HelpDialogDelegate(const char *text, KeyHelp *key_help_array); + + ~HelpDialogDelegate() override; + + bool WindowDelegateDraw(Window &window, bool force) override; + + HandleCharResult WindowDelegateHandleChar(Window &window, int key) override; + + size_t GetNumLines() const { return m_text.GetSize(); } + + size_t GetMaxLineLength() const { return m_text.GetMaxStringLength(); } + +protected: + StringList m_text; + int m_first_visible_line; +}; + +class Window { +public: + Window(const char *name) + : 
m_name(name), m_window(nullptr), m_panel(nullptr), m_parent(nullptr), + m_subwindows(), m_delegate_sp(), m_curr_active_window_idx(UINT32_MAX), + m_prev_active_window_idx(UINT32_MAX), m_delete(false), + m_needs_update(true), m_can_activate(true), m_is_subwin(false) {} + + Window(const char *name, WINDOW *w, bool del = true) + : m_name(name), m_window(nullptr), m_panel(nullptr), m_parent(nullptr), + m_subwindows(), m_delegate_sp(), m_curr_active_window_idx(UINT32_MAX), + m_prev_active_window_idx(UINT32_MAX), m_delete(del), + m_needs_update(true), m_can_activate(true), m_is_subwin(false) { + if (w) + Reset(w); + } + + Window(const char *name, const Rect &bounds) + : m_name(name), m_window(nullptr), m_parent(nullptr), m_subwindows(), + m_delegate_sp(), m_curr_active_window_idx(UINT32_MAX), + m_prev_active_window_idx(UINT32_MAX), m_delete(true), + m_needs_update(true), m_can_activate(true), m_is_subwin(false) { + Reset(::newwin(bounds.size.height, bounds.size.width, bounds.origin.y, + bounds.origin.y)); + } + + virtual ~Window() { + RemoveSubWindows(); + Reset(); + } + + void Reset(WINDOW *w = nullptr, bool del = true) { + if (m_window == w) + return; + + if (m_panel) { + ::del_panel(m_panel); + m_panel = nullptr; + } + if (m_window && m_delete) { + ::delwin(m_window); + m_window = nullptr; + m_delete = false; + } + if (w) { + m_window = w; + m_panel = ::new_panel(m_window); + m_delete = del; + } + } + + void AttributeOn(attr_t attr) { ::wattron(m_window, attr); } + void AttributeOff(attr_t attr) { ::wattroff(m_window, attr); } + void Box(chtype v_char = ACS_VLINE, chtype h_char = ACS_HLINE) { + ::box(m_window, v_char, h_char); + } + void Clear() { ::wclear(m_window); } + void Erase() { ::werase(m_window); } + Rect GetBounds() { + return Rect(GetParentOrigin(), GetSize()); + } // Get the rectangle in our parent window + int GetChar() { return ::wgetch(m_window); } + int GetCursorX() { return getcurx(m_window); } + int GetCursorY() { return getcury(m_window); } + Rect GetFrame() { + return Rect(Point(), GetSize()); + } // Get our rectangle in our own coordinate system + Point GetParentOrigin() { return Point(GetParentX(), GetParentY()); } + Size GetSize() { return Size(GetWidth(), GetHeight()); } + int GetParentX() { return getparx(m_window); } + int GetParentY() { return getpary(m_window); } + int GetMaxX() { return getmaxx(m_window); } + int GetMaxY() { return getmaxy(m_window); } + int GetWidth() { return GetMaxX(); } + int GetHeight() { return GetMaxY(); } + void MoveCursor(int x, int y) { ::wmove(m_window, y, x); } + void MoveWindow(int x, int y) { MoveWindow(Point(x, y)); } + void Resize(int w, int h) { ::wresize(m_window, h, w); } + void Resize(const Size &size) { + ::wresize(m_window, size.height, size.width); + } + void PutChar(int ch) { ::waddch(m_window, ch); } + void PutCString(const char *s, int len = -1) { ::waddnstr(m_window, s, len); } + void SetBackground(int color_pair_idx) { + ::wbkgd(m_window, COLOR_PAIR(color_pair_idx)); + } + + void PutCStringTruncated(const char *s, int right_pad) { + int bytes_left = GetWidth() - GetCursorX(); + if (bytes_left > right_pad) { + bytes_left -= right_pad; + ::waddnstr(m_window, s, bytes_left); + } + } + + void MoveWindow(const Point &origin) { + const bool moving_window = origin != GetParentOrigin(); + if (m_is_subwin && moving_window) { + // Can't move subwindows, must delete and re-create + Size size = GetSize(); + Reset(::subwin(m_parent->m_window, size.height, size.width, origin.y, + origin.x), + true); + } else { + ::mvwin(m_window, 
origin.y, origin.x); + } + } + + void SetBounds(const Rect &bounds) { + const bool moving_window = bounds.origin != GetParentOrigin(); + if (m_is_subwin && moving_window) { + // Can't move subwindows, must delete and re-create + Reset(::subwin(m_parent->m_window, bounds.size.height, bounds.size.width, + bounds.origin.y, bounds.origin.x), + true); + } else { + if (moving_window) + MoveWindow(bounds.origin); + Resize(bounds.size); + } + } + + void Printf(const char *format, ...) __attribute__((format(printf, 2, 3))) { + va_list args; + va_start(args, format); + vwprintw(m_window, format, args); + va_end(args); + } + + void Touch() { + ::touchwin(m_window); + if (m_parent) + m_parent->Touch(); + } + + WindowSP CreateSubWindow(const char *name, const Rect &bounds, + bool make_active) { + auto get_window = [this, &bounds]() { + return m_window + ? ::subwin(m_window, bounds.size.height, bounds.size.width, + bounds.origin.y, bounds.origin.x) + : ::newwin(bounds.size.height, bounds.size.width, + bounds.origin.y, bounds.origin.x); + }; + WindowSP subwindow_sp = std::make_shared(name, get_window(), true); + subwindow_sp->m_is_subwin = subwindow_sp.operator bool(); + subwindow_sp->m_parent = this; + if (make_active) { + m_prev_active_window_idx = m_curr_active_window_idx; + m_curr_active_window_idx = m_subwindows.size(); + } + m_subwindows.push_back(subwindow_sp); + ::top_panel(subwindow_sp->m_panel); + m_needs_update = true; + return subwindow_sp; + } + + bool RemoveSubWindow(Window *window) { + Windows::iterator pos, end = m_subwindows.end(); + size_t i = 0; + for (pos = m_subwindows.begin(); pos != end; ++pos, ++i) { + if ((*pos).get() == window) { + if (m_prev_active_window_idx == i) + m_prev_active_window_idx = UINT32_MAX; + else if (m_prev_active_window_idx != UINT32_MAX && + m_prev_active_window_idx > i) + --m_prev_active_window_idx; + + if (m_curr_active_window_idx == i) + m_curr_active_window_idx = UINT32_MAX; + else if (m_curr_active_window_idx != UINT32_MAX && + m_curr_active_window_idx > i) + --m_curr_active_window_idx; + window->Erase(); + m_subwindows.erase(pos); + m_needs_update = true; + if (m_parent) + m_parent->Touch(); + else + ::touchwin(stdscr); + return true; + } + } + return false; + } + + WindowSP FindSubWindow(const char *name) { + Windows::iterator pos, end = m_subwindows.end(); + size_t i = 0; + for (pos = m_subwindows.begin(); pos != end; ++pos, ++i) { + if ((*pos)->m_name == name) + return *pos; + } + return WindowSP(); + } + + void RemoveSubWindows() { + m_curr_active_window_idx = UINT32_MAX; + m_prev_active_window_idx = UINT32_MAX; + for (Windows::iterator pos = m_subwindows.begin(); + pos != m_subwindows.end(); pos = m_subwindows.erase(pos)) { + (*pos)->Erase(); + } + if (m_parent) + m_parent->Touch(); + else + ::touchwin(stdscr); + } + + WINDOW *get() { return m_window; } + + operator WINDOW *() { return m_window; } + + // Window drawing utilities + void DrawTitleBox(const char *title, const char *bottom_message = nullptr) { + attr_t attr = 0; + if (IsActive()) + attr = A_BOLD | COLOR_PAIR(2); + else + attr = 0; + if (attr) + AttributeOn(attr); + + Box(); + MoveCursor(3, 0); + + if (title && title[0]) { + PutChar('<'); + PutCString(title); + PutChar('>'); + } + + if (bottom_message && bottom_message[0]) { + int bottom_message_length = strlen(bottom_message); + int x = GetWidth() - 3 - (bottom_message_length + 2); + + if (x > 0) { + MoveCursor(x, GetHeight() - 1); + PutChar('['); + PutCString(bottom_message); + PutChar(']'); + } else { + MoveCursor(1, GetHeight() - 1); 
+        PutChar('[');
+        PutCStringTruncated(bottom_message, 1);
+      }
+    }
+    if (attr)
+      AttributeOff(attr);
+  }
+
+  virtual void Draw(bool force) {
+    if (m_delegate_sp && m_delegate_sp->WindowDelegateDraw(*this, force))
+      return;
+
+    for (auto &subwindow_sp : m_subwindows)
+      subwindow_sp->Draw(force);
+  }
+
+  bool CreateHelpSubwindow() {
+    if (m_delegate_sp) {
+      const char *text = m_delegate_sp->WindowDelegateGetHelpText();
+      KeyHelp *key_help = m_delegate_sp->WindowDelegateGetKeyHelp();
+      if ((text && text[0]) || key_help) {
+        std::unique_ptr<HelpDialogDelegate> help_delegate_up(
+            new HelpDialogDelegate(text, key_help));
+        const size_t num_lines = help_delegate_up->GetNumLines();
+        const size_t max_length = help_delegate_up->GetMaxLineLength();
+        Rect bounds = GetBounds();
+        bounds.Inset(1, 1);
+        if (max_length + 4 < static_cast<size_t>(bounds.size.width)) {
+          bounds.origin.x += (bounds.size.width - max_length + 4) / 2;
+          bounds.size.width = max_length + 4;
+        } else {
+          if (bounds.size.width > 100) {
+            const int inset_w = bounds.size.width / 4;
+            bounds.origin.x += inset_w;
+            bounds.size.width -= 2 * inset_w;
+          }
+        }
+
+        if (num_lines + 2 < static_cast<size_t>(bounds.size.height)) {
+          bounds.origin.y += (bounds.size.height - num_lines + 2) / 2;
+          bounds.size.height = num_lines + 2;
+        } else {
+          if (bounds.size.height > 100) {
+            const int inset_h = bounds.size.height / 4;
+            bounds.origin.y += inset_h;
+            bounds.size.height -= 2 * inset_h;
+          }
+        }
+        WindowSP help_window_sp;
+        Window *parent_window = GetParent();
+        if (parent_window)
+          help_window_sp = parent_window->CreateSubWindow("Help", bounds, true);
+        else
+          help_window_sp = CreateSubWindow("Help", bounds, true);
+        help_window_sp->SetDelegate(
+            WindowDelegateSP(help_delegate_up.release()));
+        return true;
+      }
+    }
+    return false;
+  }
+
+  virtual HandleCharResult HandleChar(int key) {
+    // Always check the active window first
+    HandleCharResult result = eKeyNotHandled;
+    WindowSP active_window_sp = GetActiveWindow();
+    if (active_window_sp) {
+      result = active_window_sp->HandleChar(key);
+      if (result != eKeyNotHandled)
+        return result;
+    }
+
+    if (m_delegate_sp) {
+      result = m_delegate_sp->WindowDelegateHandleChar(*this, key);
+      if (result != eKeyNotHandled)
+        return result;
+    }
+
+    // Then check for any windows that want any keys that weren't handled. This
+    // is typically only for a menubar. Make a copy of the subwindows in case
+    // any HandleChar() functions muck with the subwindows. If we don't do
+    // this, we can crash when iterating over the subwindows.
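// Editor's note: a self-contained sketch of the copy-before-iterate idiom
// described in the comment above and implemented in the loop that follows.
// Erasing from a std::vector invalidates iterators, so the loop walks a
// snapshot while the member vector is free to shrink; copying the shared_ptrs
// also keeps each element alive for the duration. All names here are
// illustrative, not part of the patch.
#include <memory>
#include <vector>
struct Handler {
  bool HandleKey(int key) { return key == 'q'; } // trivial stand-in
};
std::vector<std::shared_ptr<Handler>> g_handlers;
bool Dispatch(int key) {
  auto snapshot = g_handlers; // copy; safe even if handlers mutate g_handlers
  for (const auto &handler_sp : snapshot)
    if (handler_sp->HandleKey(key)) // may erase entries from g_handlers
      return true;
  return false;
}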
+ Windows subwindows(m_subwindows); + for (auto subwindow_sp : subwindows) { + if (!subwindow_sp->m_can_activate) { + HandleCharResult result = subwindow_sp->HandleChar(key); + if (result != eKeyNotHandled) + return result; + } + } + + return eKeyNotHandled; + } + + WindowSP GetActiveWindow() { + if (!m_subwindows.empty()) { + if (m_curr_active_window_idx >= m_subwindows.size()) { + if (m_prev_active_window_idx < m_subwindows.size()) { + m_curr_active_window_idx = m_prev_active_window_idx; + m_prev_active_window_idx = UINT32_MAX; + } else if (IsActive()) { + m_prev_active_window_idx = UINT32_MAX; + m_curr_active_window_idx = UINT32_MAX; + + // Find first window that wants to be active if this window is active + const size_t num_subwindows = m_subwindows.size(); + for (size_t i = 0; i < num_subwindows; ++i) { + if (m_subwindows[i]->GetCanBeActive()) { + m_curr_active_window_idx = i; + break; + } + } + } + } + + if (m_curr_active_window_idx < m_subwindows.size()) + return m_subwindows[m_curr_active_window_idx]; + } + return WindowSP(); + } + + bool GetCanBeActive() const { return m_can_activate; } + + void SetCanBeActive(bool b) { m_can_activate = b; } + + void SetDelegate(const WindowDelegateSP &delegate_sp) { + m_delegate_sp = delegate_sp; + } + + Window *GetParent() const { return m_parent; } + + bool IsActive() const { + if (m_parent) + return m_parent->GetActiveWindow().get() == this; + else + return true; // Top level window is always active + } + + void SelectNextWindowAsActive() { + // Move active focus to next window + const size_t num_subwindows = m_subwindows.size(); + if (m_curr_active_window_idx == UINT32_MAX) { + uint32_t idx = 0; + for (auto subwindow_sp : m_subwindows) { + if (subwindow_sp->GetCanBeActive()) { + m_curr_active_window_idx = idx; + break; + } + ++idx; + } + } else if (m_curr_active_window_idx + 1 < num_subwindows) { + bool handled = false; + m_prev_active_window_idx = m_curr_active_window_idx; + for (size_t idx = m_curr_active_window_idx + 1; idx < num_subwindows; + ++idx) { + if (m_subwindows[idx]->GetCanBeActive()) { + m_curr_active_window_idx = idx; + handled = true; + break; + } + } + if (!handled) { + for (size_t idx = 0; idx <= m_prev_active_window_idx; ++idx) { + if (m_subwindows[idx]->GetCanBeActive()) { + m_curr_active_window_idx = idx; + break; + } + } + } + } else { + m_prev_active_window_idx = m_curr_active_window_idx; + for (size_t idx = 0; idx < num_subwindows; ++idx) { + if (m_subwindows[idx]->GetCanBeActive()) { + m_curr_active_window_idx = idx; + break; + } + } + } + } + + const char *GetName() const { return m_name.c_str(); } + +protected: + std::string m_name; + WINDOW *m_window; + PANEL *m_panel; + Window *m_parent; + Windows m_subwindows; + WindowDelegateSP m_delegate_sp; + uint32_t m_curr_active_window_idx; + uint32_t m_prev_active_window_idx; + bool m_delete; + bool m_needs_update; + bool m_can_activate; + bool m_is_subwin; + +private: + DISALLOW_COPY_AND_ASSIGN(Window); +}; + +class MenuDelegate { +public: + virtual ~MenuDelegate() = default; + + virtual MenuActionResult MenuDelegateAction(Menu &menu) = 0; +}; + +class Menu : public WindowDelegate { +public: + enum class Type { Invalid, Bar, Item, Separator }; + + // Menubar or separator constructor + Menu(Type type); + + // Menuitem constructor + Menu(const char *name, const char *key_name, int key_value, + uint64_t identifier); + + ~Menu() override = default; + + const MenuDelegateSP &GetDelegate() const { return m_delegate_sp; } + + void SetDelegate(const MenuDelegateSP &delegate_sp) { 
+ m_delegate_sp = delegate_sp; + } + + void RecalculateNameLengths(); + + void AddSubmenu(const MenuSP &menu_sp); + + int DrawAndRunMenu(Window &window); + + void DrawMenuTitle(Window &window, bool highlight); + + bool WindowDelegateDraw(Window &window, bool force) override; + + HandleCharResult WindowDelegateHandleChar(Window &window, int key) override; + + MenuActionResult ActionPrivate(Menu &menu) { + MenuActionResult result = MenuActionResult::NotHandled; + if (m_delegate_sp) { + result = m_delegate_sp->MenuDelegateAction(menu); + if (result != MenuActionResult::NotHandled) + return result; + } else if (m_parent) { + result = m_parent->ActionPrivate(menu); + if (result != MenuActionResult::NotHandled) + return result; + } + return m_canned_result; + } + + MenuActionResult Action() { + // Call the recursive action so it can try to handle it with the menu + // delegate, and if not, try our parent menu + return ActionPrivate(*this); + } + + void SetCannedResult(MenuActionResult result) { m_canned_result = result; } + + Menus &GetSubmenus() { return m_submenus; } + + const Menus &GetSubmenus() const { return m_submenus; } + + int GetSelectedSubmenuIndex() const { return m_selected; } + + void SetSelectedSubmenuIndex(int idx) { m_selected = idx; } + + Type GetType() const { return m_type; } + + int GetStartingColumn() const { return m_start_col; } + + void SetStartingColumn(int col) { m_start_col = col; } + + int GetKeyValue() const { return m_key_value; } + + std::string &GetName() { return m_name; } + + int GetDrawWidth() const { + return m_max_submenu_name_length + m_max_submenu_key_name_length + 8; + } + + uint64_t GetIdentifier() const { return m_identifier; } + + void SetIdentifier(uint64_t identifier) { m_identifier = identifier; } + +protected: + std::string m_name; + std::string m_key_name; + uint64_t m_identifier; + Type m_type; + int m_key_value; + int m_start_col; + int m_max_submenu_name_length; + int m_max_submenu_key_name_length; + int m_selected; + Menu *m_parent; + Menus m_submenus; + WindowSP m_menu_window_sp; + MenuActionResult m_canned_result; + MenuDelegateSP m_delegate_sp; +}; + +// Menubar or separator constructor +Menu::Menu(Type type) + : m_name(), m_key_name(), m_identifier(0), m_type(type), m_key_value(0), + m_start_col(0), m_max_submenu_name_length(0), + m_max_submenu_key_name_length(0), m_selected(0), m_parent(nullptr), + m_submenus(), m_canned_result(MenuActionResult::NotHandled), + m_delegate_sp() {} + +// Menuitem constructor +Menu::Menu(const char *name, const char *key_name, int key_value, + uint64_t identifier) + : m_name(), m_key_name(), m_identifier(identifier), m_type(Type::Invalid), + m_key_value(key_value), m_start_col(0), m_max_submenu_name_length(0), + m_max_submenu_key_name_length(0), m_selected(0), m_parent(nullptr), + m_submenus(), m_canned_result(MenuActionResult::NotHandled), + m_delegate_sp() { + if (name && name[0]) { + m_name = name; + m_type = Type::Item; + if (key_name && key_name[0]) + m_key_name = key_name; + } else { + m_type = Type::Separator; + } +} + +void Menu::RecalculateNameLengths() { + m_max_submenu_name_length = 0; + m_max_submenu_key_name_length = 0; + Menus &submenus = GetSubmenus(); + const size_t num_submenus = submenus.size(); + for (size_t i = 0; i < num_submenus; ++i) { + Menu *submenu = submenus[i].get(); + if (static_cast(m_max_submenu_name_length) < submenu->m_name.size()) + m_max_submenu_name_length = submenu->m_name.size(); + if (static_cast(m_max_submenu_key_name_length) < + submenu->m_key_name.size()) + 
+void Menu::RecalculateNameLengths() {
+  m_max_submenu_name_length = 0;
+  m_max_submenu_key_name_length = 0;
+  Menus &submenus = GetSubmenus();
+  const size_t num_submenus = submenus.size();
+  for (size_t i = 0; i < num_submenus; ++i) {
+    Menu *submenu = submenus[i].get();
+    if (static_cast<size_t>(m_max_submenu_name_length) <
+        submenu->m_name.size())
+      m_max_submenu_name_length = submenu->m_name.size();
+    if (static_cast<size_t>(m_max_submenu_key_name_length) <
+        submenu->m_key_name.size())
+      m_max_submenu_key_name_length = submenu->m_key_name.size();
+  }
+}
+
+void Menu::AddSubmenu(const MenuSP &menu_sp) {
+  menu_sp->m_parent = this;
+  if (static_cast<size_t>(m_max_submenu_name_length) < menu_sp->m_name.size())
+    m_max_submenu_name_length = menu_sp->m_name.size();
+  if (static_cast<size_t>(m_max_submenu_key_name_length) <
+      menu_sp->m_key_name.size())
+    m_max_submenu_key_name_length = menu_sp->m_key_name.size();
+  m_submenus.push_back(menu_sp);
+}
+
+void Menu::DrawMenuTitle(Window &window, bool highlight) {
+  if (m_type == Type::Separator) {
+    window.MoveCursor(0, window.GetCursorY());
+    window.PutChar(ACS_LTEE);
+    int width = window.GetWidth();
+    if (width > 2) {
+      width -= 2;
+      for (int i = 0; i < width; ++i)
+        window.PutChar(ACS_HLINE);
+    }
+    window.PutChar(ACS_RTEE);
+  } else {
+    const int shortcut_key = m_key_value;
+    bool underlined_shortcut = false;
+    const attr_t highlight_attr = A_REVERSE;
+    if (highlight)
+      window.AttributeOn(highlight_attr);
+    if (isprint(shortcut_key)) {
+      size_t lower_pos = m_name.find(tolower(shortcut_key));
+      size_t upper_pos = m_name.find(toupper(shortcut_key));
+      const char *name = m_name.c_str();
+      size_t pos = std::min(lower_pos, upper_pos);
+      if (pos != std::string::npos) {
+        underlined_shortcut = true;
+        if (pos > 0) {
+          window.PutCString(name, pos);
+          name += pos;
+        }
+        const attr_t shortcut_attr = A_UNDERLINE | A_BOLD;
+        window.AttributeOn(shortcut_attr);
+        window.PutChar(name[0]);
+        window.AttributeOff(shortcut_attr);
+        name++;
+        if (name[0])
+          window.PutCString(name);
+      }
+    }
+
+    if (!underlined_shortcut) {
+      window.PutCString(m_name.c_str());
+    }
+
+    if (highlight)
+      window.AttributeOff(highlight_attr);
+
+    if (m_key_name.empty()) {
+      if (!underlined_shortcut && isprint(m_key_value)) {
+        window.AttributeOn(COLOR_PAIR(3));
+        window.Printf(" (%c)", m_key_value);
+        window.AttributeOff(COLOR_PAIR(3));
+      }
+    } else {
+      window.AttributeOn(COLOR_PAIR(3));
+      window.Printf(" (%s)", m_key_name.c_str());
+      window.AttributeOff(COLOR_PAIR(3));
+    }
+  }
+}
+
+bool Menu::WindowDelegateDraw(Window &window, bool force) {
+  Menus &submenus = GetSubmenus();
+  const size_t num_submenus = submenus.size();
+  const int selected_idx = GetSelectedSubmenuIndex();
+  Menu::Type menu_type = GetType();
+  switch (menu_type) {
+  case Menu::Type::Bar: {
+    window.SetBackground(2);
+    window.MoveCursor(0, 0);
+    for (size_t i = 0; i < num_submenus; ++i) {
+      Menu *menu = submenus[i].get();
+      if (i > 0)
+        window.PutChar(' ');
+      menu->SetStartingColumn(window.GetCursorX());
+      window.PutCString("| ");
+      menu->DrawMenuTitle(window, false);
+    }
+    window.PutCString(" |");
+  } break;
+
+  case Menu::Type::Item: {
+    int y = 1;
+    int x = 3;
+    // Draw the menu
+    int cursor_x = 0;
+    int cursor_y = 0;
+    window.Erase();
+    window.SetBackground(2);
+    window.Box();
+    for (size_t i = 0; i < num_submenus; ++i) {
+      const bool is_selected = (i == static_cast<size_t>(selected_idx));
+      window.MoveCursor(x, y + i);
+      if (is_selected) {
+        // Remember where we want the cursor to be
+        cursor_x = x - 1;
+        cursor_y = y + i;
+      }
+      submenus[i]->DrawMenuTitle(window, is_selected);
+    }
+    window.MoveCursor(cursor_x, cursor_y);
+  } break;
+
+  default:
+  case Menu::Type::Separator:
+    break;
+  }
+  return true; // Drawing handled...
+}
+
+HandleCharResult Menu::WindowDelegateHandleChar(Window &window, int key) {
+  HandleCharResult result = eKeyNotHandled;
+
+  Menus &submenus = GetSubmenus();
+  const size_t num_submenus = submenus.size();
+  const int selected_idx = GetSelectedSubmenuIndex();
+  Menu::Type menu_type = GetType();
+  if (menu_type == Menu::Type::Bar) {
+    MenuSP run_menu_sp;
+    switch (key) {
+    case KEY_DOWN:
+    case KEY_UP:
+      // Show last menu or first menu
+      if (selected_idx < static_cast<int>(num_submenus))
+        run_menu_sp = submenus[selected_idx];
+      else if (!submenus.empty())
+        run_menu_sp = submenus.front();
+      result = eKeyHandled;
+      break;
+
+    case KEY_RIGHT:
+      ++m_selected;
+      if (m_selected >= static_cast<int>(num_submenus))
+        m_selected = 0;
+      if (m_selected < static_cast<int>(num_submenus))
+        run_menu_sp = submenus[m_selected];
+      else if (!submenus.empty())
+        run_menu_sp = submenus.front();
+      result = eKeyHandled;
+      break;
+
+    case KEY_LEFT:
+      --m_selected;
+      if (m_selected < 0)
+        m_selected = num_submenus - 1;
+      if (m_selected < static_cast<int>(num_submenus))
+        run_menu_sp = submenus[m_selected];
+      else if (!submenus.empty())
+        run_menu_sp = submenus.front();
+      result = eKeyHandled;
+      break;
+
+    default:
+      for (size_t i = 0; i < num_submenus; ++i) {
+        if (submenus[i]->GetKeyValue() == key) {
+          SetSelectedSubmenuIndex(i);
+          run_menu_sp = submenus[i];
+          result = eKeyHandled;
+          break;
+        }
+      }
+      break;
+    }
+
+    if (run_menu_sp) {
+      // Run the action on this menu in case we need to populate the menu
+      // with dynamic content, and in case check marks or any other menu
+      // decorations need to be recalculated
+      if (run_menu_sp->Action() == MenuActionResult::Quit)
+        return eQuitApplication;
+
+      Rect menu_bounds;
+      menu_bounds.origin.x = run_menu_sp->GetStartingColumn();
+      menu_bounds.origin.y = 1;
+      menu_bounds.size.width = run_menu_sp->GetDrawWidth();
+      menu_bounds.size.height = run_menu_sp->GetSubmenus().size() + 2;
+      if (m_menu_window_sp)
+        window.GetParent()->RemoveSubWindow(m_menu_window_sp.get());
+
+      m_menu_window_sp = window.GetParent()->CreateSubWindow(
+          run_menu_sp->GetName().c_str(), menu_bounds, true);
+      m_menu_window_sp->SetDelegate(run_menu_sp);
+    }
+  } else if (menu_type == Menu::Type::Item) {
+    switch (key) {
+    case KEY_DOWN:
+      if (m_submenus.size() > 1) {
+        const int start_select = m_selected;
+        while (++m_selected != start_select) {
+          if (static_cast<size_t>(m_selected) >= num_submenus)
+            m_selected = 0;
+          if (m_submenus[m_selected]->GetType() == Type::Separator)
+            continue;
+          else
+            break;
+        }
+        return eKeyHandled;
+      }
+      break;
+
+    case KEY_UP:
+      if (m_submenus.size() > 1) {
+        const int start_select = m_selected;
+        while (--m_selected != start_select) {
+          if (m_selected < 0)
+            m_selected = num_submenus - 1;
+          if (m_submenus[m_selected]->GetType() == Type::Separator)
+            continue;
+          else
+            break;
+        }
+        return eKeyHandled;
+      }
+      break;
+
+    case KEY_RETURN:
+      if (static_cast<size_t>(selected_idx) < num_submenus) {
+        if (submenus[selected_idx]->Action() == MenuActionResult::Quit)
+          return eQuitApplication;
+        window.GetParent()->RemoveSubWindow(&window);
+        return eKeyHandled;
+      }
+      break;
+
+    case KEY_ESCAPE: // Beware: pressing the escape key incurs a 1-2 second
+                     // delay while curses waits for any further characters
+                     // of an escape sequence
+      window.GetParent()->RemoveSubWindow(&window);
+      return eKeyHandled;
+
+    default:
+      for (size_t i = 0; i < num_submenus; ++i) {
+        Menu *menu = submenus[i].get();
+        if (menu->GetKeyValue() == key) {
+          SetSelectedSubmenuIndex(i);
+          window.GetParent()->RemoveSubWindow(&window);
+          if (menu->Action() == MenuActionResult::Quit)
+            return eQuitApplication;
+          return eKeyHandled;
+        }
+      }
+      break;
+    }
+  } else if (menu_type == Menu::Type::Separator) {
+  }
+  return result;
+}
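+
+// Application owns the curses SCREEN and the top-level window and drives the
+// whole GUI from one loop in Run(): redraw if anything changed, poll for a
+// single key with a timeout, and use the timeouts to service asynchronous
+// debugger events.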
+class Application {
+public:
+  Application(FILE *in, FILE *out)
+      : m_window_sp(), m_screen(nullptr), m_in(in), m_out(out) {}
+
+  ~Application() {
+    m_window_delegates.clear();
+    m_window_sp.reset();
+    if (m_screen) {
+      ::delscreen(m_screen);
+      m_screen = nullptr;
+    }
+  }
+
+  void Initialize() {
+    ::setlocale(LC_ALL, "");
+    ::setlocale(LC_CTYPE, "");
+    m_screen = ::newterm(nullptr, m_out, m_in);
+    ::start_color();
+    ::curs_set(0);
+    ::noecho();
+    ::keypad(stdscr, TRUE);
+  }
+
+  void Terminate() { ::endwin(); }
+
+  void Run(Debugger &debugger) {
+    bool done = false;
+    int delay_in_tenths_of_a_second = 1;
+
+    // Alas, the threading model in curses is a bit lame, so we need to
+    // resort to polling every 0.1 seconds. We could poll for stdin ourselves
+    // and then pass the keys down, but then we would need to translate all
+    // of the escape sequences ourselves. So we resort to polling for input
+    // because we need to receive async process events while in this loop.
+
+    halfdelay(delay_in_tenths_of_a_second); // Poll using some number of
+                                            // tenths of a second when
+                                            // calling Window::GetChar()
+
+    ListenerSP listener_sp(
+        Listener::MakeListener("lldb.IOHandler.curses.Application"));
+    ConstString broadcaster_class_target(Target::GetStaticBroadcasterClass());
+    ConstString broadcaster_class_process(
+        Process::GetStaticBroadcasterClass());
+    ConstString broadcaster_class_thread(Thread::GetStaticBroadcasterClass());
+    debugger.EnableForwardEvents(listener_sp);
+
+    bool update = true;
+#if defined(__APPLE__)
+    std::deque<int> escape_chars;
+#endif
+
+    while (!done) {
+      if (update) {
+        m_window_sp->Draw(false);
+        // All windows should be calling Window::DeferredRefresh() instead of
+        // Window::Refresh() so we can do a single update and avoid any
+        // screen blinking
+        update_panels();
+
+        // Cursor hiding isn't working on MacOSX, so hide it in the top left
+        // corner
+        m_window_sp->MoveCursor(0, 0);
+
+        doupdate();
+        update = false;
+      }
+
+#if defined(__APPLE__)
+      // Terminal.app doesn't map its function keys correctly, F1-F4 default
+      // to: \033OP, \033OQ, \033OR, \033OS, so let's take care of this here
+      // if possible
+      int ch;
+      if (escape_chars.empty())
+        ch = m_window_sp->GetChar();
+      else {
+        ch = escape_chars.front();
+        escape_chars.pop_front();
+      }
+      if (ch == KEY_ESCAPE) {
+        int ch2 = m_window_sp->GetChar();
+        if (ch2 == 'O') {
+          int ch3 = m_window_sp->GetChar();
+          switch (ch3) {
+          case 'P':
+            ch = KEY_F(1);
+            break;
+          case 'Q':
+            ch = KEY_F(2);
+            break;
+          case 'R':
+            ch = KEY_F(3);
+            break;
+          case 'S':
+            ch = KEY_F(4);
+            break;
+          default:
+            escape_chars.push_back(ch2);
+            if (ch3 != -1)
+              escape_chars.push_back(ch3);
+            break;
+          }
+        } else if (ch2 != -1)
+          escape_chars.push_back(ch2);
+      }
+#else
+      int ch = m_window_sp->GetChar();
+
+#endif
+      if (ch == -1) {
+        if (feof(m_in) || ferror(m_in)) {
+          done = true;
+        } else {
+          // Just a timeout from using halfdelay(), check for events
+          EventSP event_sp;
+          while (listener_sp->PeekAtNextEvent()) {
+            listener_sp->GetEvent(event_sp, std::chrono::seconds(0));
+
+            if (event_sp) {
+              Broadcaster *broadcaster = event_sp->GetBroadcaster();
+              if (broadcaster) {
+                // uint32_t event_type = event_sp->GetType();
+                ConstString broadcaster_class(
+                    broadcaster->GetBroadcasterClass());
+                if (broadcaster_class == broadcaster_class_process) {
+                  debugger.GetCommandInterpreter().UpdateExecutionContext(
+                      nullptr);
+                  update = true;
+                  continue; // Don't get any key, just update our view
+                }
+              }
+            }
+          }
+        }
+      } else {
+        HandleCharResult key_result = m_window_sp->HandleChar(ch);
+        switch (key_result) {
+        case eKeyHandled:
+          debugger.GetCommandInterpreter().UpdateExecutionContext(nullptr);
+          update = true;
+          break;
+        case eKeyNotHandled:
+          break;
+        case eQuitApplication:
+          done = true;
+          break;
+        }
+      }
+    }
+
+    debugger.CancelForwardEvents(listener_sp);
+  }
+
+  WindowSP &GetMainWindow() {
+    if (!m_window_sp)
+      m_window_sp = std::make_shared<Window>("main", stdscr, false);
+    return m_window_sp;
+  }
+
+protected:
+  WindowSP m_window_sp;
+  WindowDelegates m_window_delegates;
+  SCREEN *m_screen;
+  FILE *m_in;
+  FILE *m_out;
+};
+
+} // namespace curses
+
+using namespace curses;
+
+struct Row {
+  ValueObjectManager value;
+  Row *parent;
+  // The process stop ID when the children were calculated.
+  uint32_t children_stop_id = 0;
+  int row_idx;
+  int x;
+  int y;
+  bool might_have_children;
+  bool expanded;
+  bool calculated_children;
+  std::vector<Row> children;
+
+  Row(const ValueObjectSP &v, Row *p)
+      : value(v, lldb::eDynamicDontRunTarget, true), parent(p), row_idx(0),
+        x(1), y(1), might_have_children(v ? v->MightHaveChildren() : false),
+        expanded(false), calculated_children(false), children() {}
+
+  size_t GetDepth() const {
+    if (parent)
+      return 1 + parent->GetDepth();
+    return 0;
+  }
+
+  void Expand() { expanded = true; }
+
+  std::vector<Row> &GetChildren() {
+    ProcessSP process_sp = value.GetProcessSP();
+    if (process_sp) {
+      // Check the process before dereferencing it: recompute the children
+      // if the process has stopped again since they were last calculated.
+      auto stop_id = process_sp->GetStopID();
+      if (stop_id != children_stop_id) {
+        children_stop_id = stop_id;
+        calculated_children = false;
+      }
+    }
+    if (!calculated_children) {
+      children.clear();
+      calculated_children = true;
+      ValueObjectSP valobj = value.GetSP();
+      if (valobj) {
+        const size_t num_children = valobj->GetNumChildren();
+        for (size_t i = 0; i < num_children; ++i) {
+          children.push_back(Row(valobj->GetChildAtIndex(i, true), this));
+        }
+      }
+    }
+    return children;
+  }
+
+  void Unexpand() {
+    expanded = false;
+    calculated_children = false;
+    children.clear();
+  }
+
+  void DrawTree(Window &window) {
+    if (parent)
+      parent->DrawTreeForChild(window, this, 0);
+
+    if (might_have_children) {
+      // If we can get UTF8 characters to work we should try to use the
+      // "symbol" UTF8 string below
+      // const char *symbol = "";
+      // if (row.expanded)
+      //   symbol = "\xe2\x96\xbd ";
+      // else
+      //   symbol = "\xe2\x96\xb7 ";
+      // window.PutCString (symbol);
+
+      // The ACS_DARROW and ACS_RARROW don't look very nice; they are just a
+      // 'v' or '>' character...
+      // if (expanded)
+      //   window.PutChar (ACS_DARROW);
+      // else
+      //   window.PutChar (ACS_RARROW);
+      // Since we can't find any good looking right arrow/down arrow
+      // symbols, just use a diamond...
+      window.PutChar(ACS_DIAMOND);
+      window.PutChar(ACS_HLINE);
+    }
+  }
+
+  void DrawTreeForChild(Window &window, Row *child, uint32_t reverse_depth) {
+    if (parent)
+      parent->DrawTreeForChild(window, this, reverse_depth + 1);
+
+    if (&GetChildren().back() == child) {
+      // Last child
+      if (reverse_depth == 0) {
+        window.PutChar(ACS_LLCORNER);
+        window.PutChar(ACS_HLINE);
+      } else {
+        window.PutChar(' ');
+        window.PutChar(' ');
+      }
+    } else {
+      if (reverse_depth == 0) {
+        window.PutChar(ACS_LTEE);
+        window.PutChar(ACS_HLINE);
+      } else {
+        window.PutChar(ACS_VLINE);
+        window.PutChar(' ');
+      }
+    }
+  }
+};
+
+struct DisplayOptions {
+  bool show_types;
+};
+
+class TreeItem;
+
+class TreeDelegate {
+public:
+  TreeDelegate() = default;
+  virtual ~TreeDelegate() = default;
+
+  virtual void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) = 0;
+  virtual void TreeDelegateGenerateChildren(TreeItem &item) = 0;
+  virtual bool TreeDelegateItemSelected(
+      TreeItem &item) = 0; // Return true if we need to update views
+};
+
+typedef std::shared_ptr<TreeDelegate> TreeDelegateSP;
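+
+// TreeItem is the generic node used by the tree windows below. It owns the
+// parent/child links plus the expansion and row bookkeeping, while all
+// type-specific behavior (drawing an item, generating its children, reacting
+// to selection) is deferred to the TreeDelegate it was created with.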
+class TreeItem {
+public:
+  TreeItem(TreeItem *parent, TreeDelegate &delegate, bool might_have_children)
+      : m_parent(parent), m_delegate(delegate), m_user_data(nullptr),
+        m_identifier(0), m_row_idx(-1), m_children(),
+        m_might_have_children(might_have_children), m_is_expanded(false) {}
+
+  TreeItem &operator=(const TreeItem &rhs) {
+    if (this != &rhs) {
+      m_parent = rhs.m_parent;
+      m_delegate = rhs.m_delegate;
+      m_user_data = rhs.m_user_data;
+      m_identifier = rhs.m_identifier;
+      m_row_idx = rhs.m_row_idx;
+      m_children = rhs.m_children;
+      m_might_have_children = rhs.m_might_have_children;
+      m_is_expanded = rhs.m_is_expanded;
+    }
+    return *this;
+  }
+
+  TreeItem(const TreeItem &) = default;
+
+  size_t GetDepth() const {
+    if (m_parent)
+      return 1 + m_parent->GetDepth();
+    return 0;
+  }
+
+  int GetRowIndex() const { return m_row_idx; }
+
+  void ClearChildren() { m_children.clear(); }
+
+  void Resize(size_t n, const TreeItem &t) { m_children.resize(n, t); }
+
+  TreeItem &operator[](size_t i) { return m_children[i]; }
+
+  void SetRowIndex(int row_idx) { m_row_idx = row_idx; }
+
+  size_t GetNumChildren() {
+    m_delegate.TreeDelegateGenerateChildren(*this);
+    return m_children.size();
+  }
+
+  void ItemWasSelected() { m_delegate.TreeDelegateItemSelected(*this); }
+
+  void CalculateRowIndexes(int &row_idx) {
+    SetRowIndex(row_idx);
+    ++row_idx;
+
+    const bool expanded = IsExpanded();
+
+    // The root item must calculate its children, or we must calculate the
+    // number of children if the item is expanded
+    if (m_parent == nullptr || expanded)
+      GetNumChildren();
+
+    for (auto &item : m_children) {
+      if (expanded)
+        item.CalculateRowIndexes(row_idx);
+      else
+        item.SetRowIndex(-1);
+    }
+  }
+
+  TreeItem *GetParent() { return m_parent; }
+
+  bool IsExpanded() const { return m_is_expanded; }
+
+  void Expand() { m_is_expanded = true; }
+
+  void Unexpand() { m_is_expanded = false; }
+
+  bool Draw(Window &window, const int first_visible_row,
+            const uint32_t selected_row_idx, int &row_idx,
+            int &num_rows_left) {
+    if (num_rows_left <= 0)
+      return false;
+
+    if (m_row_idx >= first_visible_row) {
+      window.MoveCursor(2, row_idx + 1);
+
+      if (m_parent)
+        m_parent->DrawTreeForChild(window, this, 0);
+
+      if (m_might_have_children) {
+        // If we can get UTF8 characters to work we should try to use the
+        // "symbol" UTF8 string below
+        // const char *symbol = "";
+        // if (row.expanded)
+        //   symbol = "\xe2\x96\xbd ";
+        // else
+        //   symbol = "\xe2\x96\xb7 ";
+        // window.PutCString (symbol);
+
+        // The ACS_DARROW and ACS_RARROW don't look very nice; they are just
+        // a 'v' or '>' character...
+        // if (expanded)
+        //   window.PutChar (ACS_DARROW);
+        // else
+        //   window.PutChar (ACS_RARROW);
+        // Since we can't find any good looking right arrow/down arrow
+        // symbols, just use a diamond...
+        window.PutChar(ACS_DIAMOND);
+        window.PutChar(ACS_HLINE);
+      }
+      bool highlight =
+          (selected_row_idx == static_cast<uint32_t>(m_row_idx)) &&
+          window.IsActive();
+
+      if (highlight)
+        window.AttributeOn(A_REVERSE);
+
+      m_delegate.TreeDelegateDrawTreeItem(*this, window);
+
+      if (highlight)
+        window.AttributeOff(A_REVERSE);
+      ++row_idx;
+      --num_rows_left;
+    }
+
+    if (num_rows_left <= 0)
+      return false; // We are done drawing...
+
+    if (IsExpanded()) {
+      for (auto &item : m_children) {
+        // If we displayed all the rows and item.Draw() returns false we are
+        // done drawing and can exit this for loop
+        if (!item.Draw(window, first_visible_row, selected_row_idx, row_idx,
+                       num_rows_left))
+          break;
+      }
+    }
+    return num_rows_left >= 0; // Return true if not done drawing yet
+  }
+
+  void DrawTreeForChild(Window &window, TreeItem *child,
+                        uint32_t reverse_depth) {
+    if (m_parent)
+      m_parent->DrawTreeForChild(window, this, reverse_depth + 1);
+
+    if (&m_children.back() == child) {
+      // Last child
+      if (reverse_depth == 0) {
+        window.PutChar(ACS_LLCORNER);
+        window.PutChar(ACS_HLINE);
+      } else {
+        window.PutChar(' ');
+        window.PutChar(' ');
+      }
+    } else {
+      if (reverse_depth == 0) {
+        window.PutChar(ACS_LTEE);
+        window.PutChar(ACS_HLINE);
+      } else {
+        window.PutChar(ACS_VLINE);
+        window.PutChar(' ');
+      }
+    }
+  }
+
+  TreeItem *GetItemForRowIndex(uint32_t row_idx) {
+    if (static_cast<uint32_t>(m_row_idx) == row_idx)
+      return this;
+    if (m_children.empty())
+      return nullptr;
+    if (IsExpanded()) {
+      for (auto &item : m_children) {
+        TreeItem *selected_item_ptr = item.GetItemForRowIndex(row_idx);
+        if (selected_item_ptr)
+          return selected_item_ptr;
+      }
+    }
+    return nullptr;
+  }
+
+  void *GetUserData() const { return m_user_data; }
+
+  void SetUserData(void *user_data) { m_user_data = user_data; }
+
+  uint64_t GetIdentifier() const { return m_identifier; }
+
+  void SetIdentifier(uint64_t identifier) { m_identifier = identifier; }
+
+  void SetMightHaveChildren(bool b) { m_might_have_children = b; }
+
+protected:
+  TreeItem *m_parent;
+  TreeDelegate &m_delegate;
+  void *m_user_data;
+  uint64_t m_identifier;
+  int m_row_idx; // Zero based visible row index, -1 if not visible or for
+                 // the root item
+  std::vector<TreeItem> m_children;
+  bool m_might_have_children;
+  bool m_is_expanded;
+};
+
+class TreeWindowDelegate : public WindowDelegate {
+public:
+  TreeWindowDelegate(Debugger &debugger, const TreeDelegateSP &delegate_sp)
+      : m_debugger(debugger), m_delegate_sp(delegate_sp),
+        m_root(nullptr, *delegate_sp, true), m_selected_item(nullptr),
+        m_num_rows(0), m_selected_row_idx(0), m_first_visible_row(0),
+        m_min_x(0), m_min_y(0), m_max_x(0), m_max_y(0) {}
+
+  int NumVisibleRows() const { return m_max_y - m_min_y; }
+
+  bool WindowDelegateDraw(Window &window, bool force) override {
+    ExecutionContext exe_ctx(
+        m_debugger.GetCommandInterpreter().GetExecutionContext());
+    Process *process = exe_ctx.GetProcessPtr();
+
+    bool display_content = false;
+    if (process) {
+      StateType state = process->GetState();
+      if (StateIsStoppedState(state, true)) {
+        // We are stopped, so it is ok to display the content
+        display_content = true;
+      } else if (StateIsRunningState(state)) {
+        return true; // Don't do any updating when we are running
+      }
+    }
+
+    m_min_x = 2;
+    m_min_y = 1;
+    m_max_x = window.GetWidth() - 1;
+    m_max_y = window.GetHeight() - 1;
+
+    window.Erase();
+    window.DrawTitleBox(window.GetName());
+
+    if (display_content) {
+      const int num_visible_rows = NumVisibleRows();
+      m_num_rows = 0;
+      m_root.CalculateRowIndexes(m_num_rows);
+
+      // If we unexpanded while having something selected, and our total
+      // number of rows is now less than the number of visible rows, make
+      // sure we show all the rows by setting the first visible row
+      // accordingly.
+      if (m_first_visible_row > 0 && m_num_rows < num_visible_rows)
+        m_first_visible_row = 0;
+
+      // Make sure the selected row is always visible
+      if (m_selected_row_idx < m_first_visible_row)
+        m_first_visible_row = m_selected_row_idx;
+      else if (m_first_visible_row + num_visible_rows <= m_selected_row_idx)
+        m_first_visible_row = m_selected_row_idx - num_visible_rows + 1;
+
+      int row_idx = 0;
+      int num_rows_left = num_visible_rows;
+      m_root.Draw(window, m_first_visible_row, m_selected_row_idx, row_idx,
+                  num_rows_left);
+      // Get the selected row
+      m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx);
+    } else {
+      m_selected_item = nullptr;
+    }
+
+    return true; // Drawing handled
+  }
+
+  const char *WindowDelegateGetHelpText() override {
+    return "Thread window keyboard shortcuts:";
+  }
+
+  KeyHelp *WindowDelegateGetKeyHelp() override {
+    static curses::KeyHelp g_source_view_key_help[] = {
+        {KEY_UP, "Select previous item"},
+        {KEY_DOWN, "Select next item"},
+        {KEY_RIGHT, "Expand the selected item"},
+        {KEY_LEFT,
+         "Unexpand the selected item or select parent if not expanded"},
+        {KEY_PPAGE, "Page up"},
+        {KEY_NPAGE, "Page down"},
+        {'h', "Show help dialog"},
+        {' ', "Toggle item expansion"},
+        {',', "Page up"},
+        {'.', "Page down"},
+        {'\0', nullptr}};
+    return g_source_view_key_help;
+  }
+
+  HandleCharResult WindowDelegateHandleChar(Window &window, int c) override {
+    switch (c) {
+    case ',':
+    case KEY_PPAGE:
+      // Page up key
+      if (m_first_visible_row > 0) {
+        if (m_first_visible_row > m_max_y)
+          m_first_visible_row -= m_max_y;
+        else
+          m_first_visible_row = 0;
+        m_selected_row_idx = m_first_visible_row;
+        m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx);
+        if (m_selected_item)
+          m_selected_item->ItemWasSelected();
+      }
+      return eKeyHandled;
+
+    case '.':
+    case KEY_NPAGE:
+      // Page down key
+      if (m_num_rows > m_max_y) {
+        if (m_first_visible_row + m_max_y < m_num_rows) {
+          m_first_visible_row += m_max_y;
+          m_selected_row_idx = m_first_visible_row;
+          m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx);
+          if (m_selected_item)
+            m_selected_item->ItemWasSelected();
+        }
+      }
+      return eKeyHandled;
+
+    case KEY_UP:
+      if (m_selected_row_idx > 0) {
+        --m_selected_row_idx;
+        m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx);
+        if (m_selected_item)
+          m_selected_item->ItemWasSelected();
+      }
+      return eKeyHandled;
+
+    case KEY_DOWN:
+      if (m_selected_row_idx + 1 < m_num_rows) {
+        ++m_selected_row_idx;
+        m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx);
+        if (m_selected_item)
+          m_selected_item->ItemWasSelected();
+      }
+      return eKeyHandled;
+
+    case KEY_RIGHT:
+      if (m_selected_item) {
+        if (!m_selected_item->IsExpanded())
+          m_selected_item->Expand();
+      }
+      return eKeyHandled;
+
+    case KEY_LEFT:
+      if (m_selected_item) {
+        if (m_selected_item->IsExpanded())
+          m_selected_item->Unexpand();
+        else if (m_selected_item->GetParent()) {
+          m_selected_row_idx = m_selected_item->GetParent()->GetRowIndex();
+          m_selected_item = m_root.GetItemForRowIndex(m_selected_row_idx);
+          if (m_selected_item)
+            m_selected_item->ItemWasSelected();
+        }
+      }
+      return eKeyHandled;
+
+    case ' ':
+      // Toggle expansion state when SPACE is pressed
+      if (m_selected_item) {
+        if (m_selected_item->IsExpanded())
+          m_selected_item->Unexpand();
+        else
+          m_selected_item->Expand();
+      }
+      return eKeyHandled;
+
+    case 'h':
+      window.CreateHelpSubwindow();
+      return eKeyHandled;
+
+    default:
+      break;
+    }
+    return eKeyNotHandled;
+  }
+
+protected:
+  Debugger &m_debugger;
+  TreeDelegateSP m_delegate_sp;
+  TreeItem m_root;
+  TreeItem *m_selected_item;
+  int m_num_rows;
+  int m_selected_row_idx;
+  int m_first_visible_row;
+  int m_min_x;
+  int m_min_y;
+  int m_max_x;
+  int m_max_y;
+};
+
+class FrameTreeDelegate : public TreeDelegate {
+public:
+  FrameTreeDelegate() : TreeDelegate() {
+    FormatEntity::Parse(
+        "frame #${frame.index}: {${function.name}${function.pc-offset}}}",
+        m_format);
+  }
+
+  ~FrameTreeDelegate() override = default;
+
+  void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) override {
+    Thread *thread = (Thread *)item.GetUserData();
+    if (thread) {
+      const uint64_t frame_idx = item.GetIdentifier();
+      StackFrameSP frame_sp = thread->GetStackFrameAtIndex(frame_idx);
+      if (frame_sp) {
+        StreamString strm;
+        const SymbolContext &sc =
+            frame_sp->GetSymbolContext(eSymbolContextEverything);
+        ExecutionContext exe_ctx(frame_sp);
+        if (FormatEntity::Format(m_format, strm, &sc, &exe_ctx, nullptr,
+                                 nullptr, false, false)) {
+          int right_pad = 1;
+          window.PutCStringTruncated(strm.GetString().str().c_str(),
+                                     right_pad);
+        }
+      }
+    }
+  }
+
+  void TreeDelegateGenerateChildren(TreeItem &item) override {
+    // No children for frames yet...
+  }
+
+  bool TreeDelegateItemSelected(TreeItem &item) override {
+    Thread *thread = (Thread *)item.GetUserData();
+    if (thread) {
+      thread->GetProcess()->GetThreadList().SetSelectedThreadByID(
+          thread->GetID());
+      const uint64_t frame_idx = item.GetIdentifier();
+      thread->SetSelectedFrameByIndex(frame_idx);
+      return true;
+    }
+    return false;
+  }
+
+protected:
+  FormatEntity::Entry m_format;
+};
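+
+// The threads view composes three delegates: ThreadsTreeDelegate (below)
+// draws the process root item and creates one child per thread,
+// ThreadTreeDelegate draws each thread and creates one child per stack
+// frame, and FrameTreeDelegate (above) draws the individual frames.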
+class ThreadTreeDelegate : public TreeDelegate {
+public:
+  ThreadTreeDelegate(Debugger &debugger)
+      : TreeDelegate(), m_debugger(debugger), m_tid(LLDB_INVALID_THREAD_ID),
+        m_stop_id(UINT32_MAX) {
+    FormatEntity::Parse("thread #${thread.index}: tid = ${thread.id}{, stop "
+                        "reason = ${thread.stop-reason}}",
+                        m_format);
+  }
+
+  ~ThreadTreeDelegate() override = default;
+
+  ProcessSP GetProcess() {
+    return m_debugger.GetCommandInterpreter()
+        .GetExecutionContext()
+        .GetProcessSP();
+  }
+
+  ThreadSP GetThread(const TreeItem &item) {
+    ProcessSP process_sp = GetProcess();
+    if (process_sp)
+      return process_sp->GetThreadList().FindThreadByID(item.GetIdentifier());
+    return ThreadSP();
+  }
+
+  void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) override {
+    ThreadSP thread_sp = GetThread(item);
+    if (thread_sp) {
+      StreamString strm;
+      ExecutionContext exe_ctx(thread_sp);
+      if (FormatEntity::Format(m_format, strm, nullptr, &exe_ctx, nullptr,
+                               nullptr, false, false)) {
+        int right_pad = 1;
+        window.PutCStringTruncated(strm.GetString().str().c_str(), right_pad);
+      }
+    }
+  }
+
+  void TreeDelegateGenerateChildren(TreeItem &item) override {
+    ProcessSP process_sp = GetProcess();
+    if (process_sp && process_sp->IsAlive()) {
+      StateType state = process_sp->GetState();
+      if (StateIsStoppedState(state, true)) {
+        ThreadSP thread_sp = GetThread(item);
+        if (thread_sp) {
+          if (m_stop_id == process_sp->GetStopID() &&
+              thread_sp->GetID() == m_tid)
+            return; // Children are already up to date
+          if (!m_frame_delegate_sp) {
+            // Always expand the thread item the first time we show it
+            m_frame_delegate_sp = std::make_shared<FrameTreeDelegate>();
+          }
+
+          m_stop_id = process_sp->GetStopID();
+          m_tid = thread_sp->GetID();
+
+          TreeItem t(&item, *m_frame_delegate_sp, false);
+          size_t num_frames = thread_sp->GetStackFrameCount();
+          item.Resize(num_frames, t);
+          for (size_t i = 0; i < num_frames; ++i) {
+            item[i].SetUserData(thread_sp.get());
+            item[i].SetIdentifier(i);
+          }
+        }
+        return;
+      }
+    }
+    item.ClearChildren();
+  }
+
+  bool TreeDelegateItemSelected(TreeItem &item) override {
+    ProcessSP process_sp = GetProcess();
+    if (process_sp && process_sp->IsAlive()) {
+      StateType state = process_sp->GetState();
+      if (StateIsStoppedState(state, true)) {
+        ThreadSP thread_sp = GetThread(item);
+        if (thread_sp) {
+          ThreadList &thread_list = thread_sp->GetProcess()->GetThreadList();
+          std::lock_guard<std::recursive_mutex> guard(thread_list.GetMutex());
+          ThreadSP selected_thread_sp = thread_list.GetSelectedThread();
+          if (selected_thread_sp->GetID() != thread_sp->GetID()) {
+            thread_list.SetSelectedThreadByID(thread_sp->GetID());
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+
+protected:
+  Debugger &m_debugger;
+  std::shared_ptr<FrameTreeDelegate> m_frame_delegate_sp;
+  lldb::user_id_t m_tid;
+  uint32_t m_stop_id;
+  FormatEntity::Entry m_format;
+};
+
+class ThreadsTreeDelegate : public TreeDelegate {
+public:
+  ThreadsTreeDelegate(Debugger &debugger)
+      : TreeDelegate(), m_thread_delegate_sp(), m_debugger(debugger),
+        m_stop_id(UINT32_MAX) {
+    FormatEntity::Parse("process ${process.id}{, name = ${process.name}}",
+                        m_format);
+  }
+
+  ~ThreadsTreeDelegate() override = default;
+
+  ProcessSP GetProcess() {
+    return m_debugger.GetCommandInterpreter()
+        .GetExecutionContext()
+        .GetProcessSP();
+  }
+
+  void TreeDelegateDrawTreeItem(TreeItem &item, Window &window) override {
+    ProcessSP process_sp = GetProcess();
+    if (process_sp && process_sp->IsAlive()) {
+      StreamString strm;
+      ExecutionContext exe_ctx(process_sp);
+      if (FormatEntity::Format(m_format, strm, nullptr, &exe_ctx, nullptr,
+                               nullptr, false, false)) {
+        int right_pad = 1;
+        window.PutCStringTruncated(strm.GetString().str().c_str(), right_pad);
+      }
+    }
+  }
+
+  void TreeDelegateGenerateChildren(TreeItem &item) override {
+    ProcessSP process_sp = GetProcess();
+    if (process_sp && process_sp->IsAlive()) {
+      StateType state = process_sp->GetState();
+      if (StateIsStoppedState(state, true)) {
+        const uint32_t stop_id = process_sp->GetStopID();
+        if (m_stop_id == stop_id)
+          return; // Children are already up to date
+
+        m_stop_id = stop_id;
+
+        if (!m_thread_delegate_sp) {
+          // Always expand the thread item the first time we show it
+          // item.Expand();
+          m_thread_delegate_sp =
+              std::make_shared<ThreadTreeDelegate>(m_debugger);
+        }
+
+        TreeItem t(&item, *m_thread_delegate_sp, false);
+        ThreadList &threads = process_sp->GetThreadList();
+        std::lock_guard<std::recursive_mutex> guard(threads.GetMutex());
+        size_t num_threads = threads.GetSize();
+        item.Resize(num_threads, t);
+        for (size_t i = 0; i < num_threads; ++i) {
+          item[i].SetIdentifier(threads.GetThreadAtIndex(i)->GetID());
+          item[i].SetMightHaveChildren(true);
+        }
+        return;
+      }
+    }
+    item.ClearChildren();
+  }
+
+  bool TreeDelegateItemSelected(TreeItem &item) override { return false; }
+
+protected:
+  std::shared_ptr<ThreadTreeDelegate> m_thread_delegate_sp;
+  Debugger &m_debugger;
+  uint32_t m_stop_id;
+  FormatEntity::Entry m_format;
+};
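+
+// ValueObjectListDelegate displays a list of value objects as expandable
+// rows and implements all of the navigation and formatting keys. The frame
+// variable and register windows below derive from it and only override
+// WindowDelegateDraw() to decide which values are shown.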
+class ValueObjectListDelegate : public WindowDelegate {
+public:
+  ValueObjectListDelegate()
+      : m_rows(), m_selected_row(nullptr), m_selected_row_idx(0),
+        m_first_visible_row(0), m_num_rows(0), m_max_x(0), m_max_y(0) {}
+
+  ValueObjectListDelegate(ValueObjectList &valobj_list)
+      : m_rows(), m_selected_row(nullptr), m_selected_row_idx(0),
+        m_first_visible_row(0), m_num_rows(0), m_max_x(0), m_max_y(0) {
+    SetValues(valobj_list);
+  }
+
+  ~ValueObjectListDelegate() override = default;
+
+  void SetValues(ValueObjectList &valobj_list) {
+    m_selected_row = nullptr;
+    m_selected_row_idx = 0;
+    m_first_visible_row = 0;
+    m_num_rows = 0;
+    m_rows.clear();
+    for (auto &valobj_sp : valobj_list.GetObjects())
+      m_rows.push_back(Row(valobj_sp, nullptr));
+  }
+
+  bool WindowDelegateDraw(Window &window, bool force) override {
+    m_num_rows = 0;
+    m_min_x = 2;
+    m_min_y = 1;
+    m_max_x = window.GetWidth() - 1;
+    m_max_y = window.GetHeight() - 1;
+
+    window.Erase();
+    window.DrawTitleBox(window.GetName());
+
+    const int num_visible_rows = NumVisibleRows();
+    const int num_rows = CalculateTotalNumberRows(m_rows);
+
+    // If we unexpanded while having something selected, and our total
+    // number of rows is now less than the number of visible rows, make sure
+    // we show all the rows by setting the first visible row accordingly.
+    if (m_first_visible_row > 0 && num_rows < num_visible_rows)
+      m_first_visible_row = 0;
+
+    // Make sure the selected row is always visible
+    if (m_selected_row_idx < m_first_visible_row)
+      m_first_visible_row = m_selected_row_idx;
+    else if (m_first_visible_row + num_visible_rows <= m_selected_row_idx)
+      m_first_visible_row = m_selected_row_idx - num_visible_rows + 1;
+
+    DisplayRows(window, m_rows, g_options);
+
+    // Get the selected row
+    m_selected_row = GetRowForRowIndex(m_selected_row_idx);
+    // Keep the cursor on the selected row so the highlight and the cursor
+    // are always on the same line
+    if (m_selected_row)
+      window.MoveCursor(m_selected_row->x, m_selected_row->y);
+
+    return true; // Drawing handled
+  }
+
+  KeyHelp *WindowDelegateGetKeyHelp() override {
+    static curses::KeyHelp g_source_view_key_help[] = {
+        {KEY_UP, "Select previous item"},
+        {KEY_DOWN, "Select next item"},
+        {KEY_RIGHT, "Expand selected item"},
+        {KEY_LEFT, "Unexpand selected item or select parent if not expanded"},
+        {KEY_PPAGE, "Page up"},
+        {KEY_NPAGE, "Page down"},
+        {'A', "Format as annotated address"},
+        {'b', "Format as binary"},
+        {'B', "Format as hex bytes with ASCII"},
+        {'c', "Format as character"},
+        {'d', "Format as a signed integer"},
+        {'D', "Format selected value using the default format for the type"},
+        {'f', "Format as float"},
+        {'h', "Show help dialog"},
+        {'i', "Format as instructions"},
+        {'o', "Format as octal"},
+        {'p', "Format as pointer"},
+        {'s', "Format as C string"},
+        {'t', "Toggle showing/hiding type names"},
+        {'u', "Format as an unsigned integer"},
+        {'x', "Format as hex"},
+        {'X', "Format as uppercase hex"},
+        {' ', "Toggle item expansion"},
+        {',', "Page up"},
+        {'.', "Page down"},
+        {'\0', nullptr}};
+    return g_source_view_key_help;
+  }
+
+  HandleCharResult WindowDelegateHandleChar(Window &window, int c) override {
+    switch (c) {
+    case 'x':
+    case 'X':
+    case 'o':
+    case 's':
+    case 'u':
+    case 'd':
+    case 'D':
+    case 'i':
+    case 'A':
+    case 'p':
+    case 'c':
+    case 'b':
+    case 'B':
+    case 'f':
+      // Change the format for the currently selected item
+      if (m_selected_row) {
+        auto valobj_sp = m_selected_row->value.GetSP();
+        if (valobj_sp)
+          valobj_sp->SetFormat(FormatForChar(c));
+      }
+      return eKeyHandled;
+
+    case 't':
+      // Toggle showing type names
+      g_options.show_types = !g_options.show_types;
+      return eKeyHandled;
+
+    case ',':
+    case KEY_PPAGE:
+      // Page up key
+      if (m_first_visible_row > 0) {
+        if (static_cast<int>(m_first_visible_row) > m_max_y)
+          m_first_visible_row -= m_max_y;
+        else
+          m_first_visible_row = 0;
+        m_selected_row_idx = m_first_visible_row;
+      }
+      return eKeyHandled;
+
+    case '.':
+    case KEY_NPAGE:
+      // Page down key
+      if (m_num_rows > static_cast<uint32_t>(m_max_y)) {
+        if (m_first_visible_row + m_max_y < m_num_rows) {
+          m_first_visible_row += m_max_y;
+          m_selected_row_idx = m_first_visible_row;
+        }
+      }
+      return eKeyHandled;
+
+    case KEY_UP:
+      if (m_selected_row_idx > 0)
+        --m_selected_row_idx;
+      return eKeyHandled;
+
+    case KEY_DOWN:
+      if (m_selected_row_idx + 1 < m_num_rows)
+        ++m_selected_row_idx;
+      return eKeyHandled;
+
+    case KEY_RIGHT:
+      if (m_selected_row) {
+        if (!m_selected_row->expanded)
+          m_selected_row->Expand();
+      }
+      return eKeyHandled;
+
+    case KEY_LEFT:
+      if (m_selected_row) {
+        if (m_selected_row->expanded)
+          m_selected_row->Unexpand();
+        else if (m_selected_row->parent)
+          m_selected_row_idx = m_selected_row->parent->row_idx;
+      }
+      return eKeyHandled;
+
+    case ' ':
+      // Toggle expansion state when SPACE is pressed
+      if (m_selected_row) {
+        if (m_selected_row->expanded)
+          m_selected_row->Unexpand();
+        else
+          m_selected_row->Expand();
+      }
+      return eKeyHandled;
+
+    case 'h':
+      window.CreateHelpSubwindow();
+      return eKeyHandled;
+
+    default:
+      break;
+    }
+    return eKeyNotHandled;
+  }
+
+protected:
+  std::vector<Row> m_rows;
+  Row *m_selected_row;
+  uint32_t m_selected_row_idx;
+  uint32_t m_first_visible_row;
+  uint32_t m_num_rows;
+  int m_min_x;
+  int m_min_y;
+  int m_max_x;
+  int m_max_y;
+
+  static Format FormatForChar(int c) {
+    switch (c) {
+    case 'x':
+      return eFormatHex;
+    case 'X':
+      return eFormatHexUppercase;
+    case 'o':
+      return eFormatOctal;
+    case 's':
+      return eFormatCString;
+    case 'u':
+      return eFormatUnsigned;
+    case 'd':
+      return eFormatDecimal;
+    case 'D':
+      return eFormatDefault;
+    case 'i':
+      return eFormatInstruction;
+    case 'A':
+      return eFormatAddressInfo;
+    case 'p':
+      return eFormatPointer;
+    case 'c':
+      return eFormatChar;
+    case 'b':
+      return eFormatBinary;
+    case 'B':
+      return eFormatBytesWithASCII;
+    case 'f':
+      return eFormatFloat;
+    }
+    return eFormatDefault;
+  }
+
+  bool DisplayRowObject(Window &window, Row &row, DisplayOptions &options,
+                        bool highlight, bool last_child) {
+    ValueObject *valobj = row.value.GetSP().get();
+
+    if (valobj == nullptr)
+      return false;
+
+    const char *type_name =
+        options.show_types ? valobj->GetTypeName().GetCString() : nullptr;
+    const char *name = valobj->GetName().GetCString();
+    const char *value = valobj->GetValueAsCString();
+    const char *summary = valobj->GetSummaryAsCString();
+
+    window.MoveCursor(row.x, row.y);
+
+    row.DrawTree(window);
+
+    if (highlight)
+      window.AttributeOn(A_REVERSE);
+
+    if (type_name && type_name[0])
+      window.Printf("(%s) ", type_name);
+
+    if (name && name[0])
+      window.PutCString(name);
+
+    attr_t changed_attr = 0;
+    if (valobj->GetValueDidChange())
+      changed_attr = COLOR_PAIR(5) | A_BOLD;
+
+    if (value && value[0]) {
+      window.PutCString(" = ");
+      if (changed_attr)
+        window.AttributeOn(changed_attr);
+      window.PutCString(value);
+      if (changed_attr)
+        window.AttributeOff(changed_attr);
+    }
+
+    if (summary && summary[0]) {
+      window.PutChar(' ');
+      if (changed_attr)
+        window.AttributeOn(changed_attr);
+      window.PutCString(summary);
+      if (changed_attr)
+        window.AttributeOff(changed_attr);
+    }
+
+    if (highlight)
+      window.AttributeOff(A_REVERSE);
+
+    return true;
+  }
+
+  void DisplayRows(Window &window, std::vector<Row> &rows,
+                   DisplayOptions &options) {
+    // > 0x25B7
+    // \/ 0x25BD
+
+    bool window_is_active = window.IsActive();
+    for (auto &row : rows) {
+      const bool last_child = row.parent && &rows[rows.size() - 1] == &row;
+      // Save the row index in each Row structure
+      row.row_idx = m_num_rows;
+      if ((m_num_rows >= m_first_visible_row) &&
+          ((m_num_rows - m_first_visible_row) <
+           static_cast<uint32_t>(NumVisibleRows()))) {
+        row.x = m_min_x;
+        row.y = m_num_rows - m_first_visible_row + 1;
+        if (DisplayRowObject(window, row, options,
+                             window_is_active &&
+                                 m_num_rows == m_selected_row_idx,
+                             last_child)) {
+          ++m_num_rows;
+        } else {
+          row.x = 0;
+          row.y = 0;
+        }
+      } else {
+        row.x = 0;
+        row.y = 0;
+        ++m_num_rows;
+      }
+
+      auto &children = row.GetChildren();
+      if (row.expanded && !children.empty()) {
+        DisplayRows(window, children, options);
+      }
+    }
+  }
+
+  int CalculateTotalNumberRows(std::vector<Row> &rows) {
+    int row_count = 0;
+    for (auto &row : rows) {
+      ++row_count;
+      if (row.expanded)
+        row_count += CalculateTotalNumberRows(row.GetChildren());
+    }
+    return row_count;
+  }
+
+  static Row *GetRowForRowIndexImpl(std::vector<Row> &rows,
+                                    size_t &row_index) {
+    for (auto &row : rows) {
+      if (row_index == 0)
+        return &row;
+      else {
+        --row_index;
+        auto &children = row.GetChildren();
+        if (row.expanded && !children.empty()) {
+          Row *result = GetRowForRowIndexImpl(children, row_index);
+          if (result)
+            return result;
+        }
+      }
+    }
+    return nullptr;
+  }
+
+  Row *GetRowForRowIndex(size_t row_index) {
+    return GetRowForRowIndexImpl(m_rows, row_index);
+  }
+
+  int NumVisibleRows() const { return m_max_y - m_min_y; }
+
+  static DisplayOptions g_options;
+};
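+
+// Displays the locals of the currently selected frame. The value list is
+// only rebuilt when the frame's lexical block changes, so redraws while
+// stopped at the same location reuse the existing rows.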
+class FrameVariablesWindowDelegate : public ValueObjectListDelegate {
+public:
+  FrameVariablesWindowDelegate(Debugger &debugger)
+      : ValueObjectListDelegate(), m_debugger(debugger),
+        m_frame_block(nullptr) {}
+
+  ~FrameVariablesWindowDelegate() override = default;
+
+  const char *WindowDelegateGetHelpText() override {
+    return "Frame variable window keyboard shortcuts:";
+  }
+
+  bool WindowDelegateDraw(Window &window, bool force) override {
+    ExecutionContext exe_ctx(
+        m_debugger.GetCommandInterpreter().GetExecutionContext());
+    Process *process = exe_ctx.GetProcessPtr();
+    Block *frame_block = nullptr;
+    StackFrame *frame = nullptr;
+
+    if (process) {
+      StateType state = process->GetState();
+      if (StateIsStoppedState(state, true)) {
+        frame = exe_ctx.GetFramePtr();
+        if (frame)
+          frame_block = frame->GetFrameBlock();
+      } else if (StateIsRunningState(state)) {
+        return true; // Don't do any updating when we are running
+      }
+    }
+
+    ValueObjectList local_values;
+    if (frame_block) {
+      // Only update the variables if they have changed
+      if (m_frame_block != frame_block) {
+        m_frame_block = frame_block;
+
+        VariableList *locals = frame->GetVariableList(true);
+        if (locals) {
+          const DynamicValueType use_dynamic = eDynamicDontRunTarget;
+          for (const VariableSP &local_sp : *locals) {
+            ValueObjectSP value_sp =
+                frame->GetValueObjectForFrameVariable(local_sp, use_dynamic);
+            if (value_sp) {
+              ValueObjectSP synthetic_value_sp = value_sp->GetSyntheticValue();
+              if (synthetic_value_sp)
+                local_values.Append(synthetic_value_sp);
+              else
+                local_values.Append(value_sp);
+            }
+          }
+          // Update the values
+          SetValues(local_values);
+        }
+      }
+    } else {
+      m_frame_block = nullptr;
+      // Update the values with an empty list if there is no frame
+      SetValues(local_values);
+    }
+
+    return ValueObjectListDelegate::WindowDelegateDraw(window, force);
+  }
+
+protected:
+  Debugger &m_debugger;
+  Block *m_frame_block;
+};
+
+class RegistersWindowDelegate : public ValueObjectListDelegate {
+public:
+  RegistersWindowDelegate(Debugger &debugger)
+      : ValueObjectListDelegate(), m_debugger(debugger) {}
+
+  ~RegistersWindowDelegate() override = default;
+
+  const char *WindowDelegateGetHelpText() override {
+    return "Register window keyboard shortcuts:";
+  }
+
+  bool WindowDelegateDraw(Window &window, bool force) override {
+    ExecutionContext exe_ctx(
+        m_debugger.GetCommandInterpreter().GetExecutionContext());
+    StackFrame *frame = exe_ctx.GetFramePtr();
+
+    ValueObjectList value_list;
+    if (frame) {
+      if (frame->GetStackID() != m_stack_id) {
+        m_stack_id = frame->GetStackID();
+        RegisterContextSP reg_ctx(frame->GetRegisterContext());
+        if (reg_ctx) {
+          const uint32_t num_sets = reg_ctx->GetRegisterSetCount();
+          for (uint32_t set_idx = 0; set_idx < num_sets; ++set_idx) {
+            value_list.Append(
+                ValueObjectRegisterSet::Create(frame, reg_ctx, set_idx));
+          }
+        }
+        SetValues(value_list);
+      }
+    } else {
+      Process *process = exe_ctx.GetProcessPtr();
+      if (process && process->IsAlive())
+        return true; // Don't do any updating if we are running
+      else {
+        // Update the values with an empty list if there is no process or
+        // the process isn't alive anymore
+        SetValues(value_list);
+      }
+    }
+    return ValueObjectListDelegate::WindowDelegateDraw(window, force);
+  }
+
+protected:
+  Debugger &m_debugger;
+  StackID m_stack_id;
+};
+
+static const char *CursesKeyToCString(int ch) {
+  static char g_desc[32];
+  if (ch >= KEY_F0 && ch < KEY_F0 + 64) {
+    snprintf(g_desc, sizeof(g_desc), "F%u", ch - KEY_F0);
+    return g_desc;
+  }
+  switch (ch) {
+  case KEY_DOWN:
+    return "down";
+  case KEY_UP:
+    return "up";
+  case KEY_LEFT:
+    return "left";
+  case KEY_RIGHT:
+    return "right";
+  case KEY_HOME:
+    return "home";
+  case KEY_BACKSPACE:
+    return "backspace";
+  case KEY_DL:
+    return "delete-line";
+  case KEY_IL:
+    return "insert-line";
+  case KEY_DC:
+    return "delete-char";
+  case KEY_IC:
+    return "insert-char";
+  case KEY_CLEAR:
+    return "clear";
+  case KEY_EOS:
+    return "clear-to-eos";
+  case KEY_EOL:
+    return "clear-to-eol";
+  case KEY_SF:
+    return "scroll-forward";
+  case KEY_SR:
+    return "scroll-backward";
+  case KEY_NPAGE:
+    return "page-down";
+  case KEY_PPAGE:
+    return "page-up";
+  case KEY_STAB:
+    return "set-tab";
+  case KEY_CTAB:
+    return "clear-tab";
+  case KEY_CATAB:
+    return "clear-all-tabs";
+  case KEY_ENTER:
+    return "enter";
+  case KEY_PRINT:
+    return "print";
+  case KEY_LL:
+    return "lower-left key";
+  case KEY_A1:
+    return "upper left of keypad";
+  case KEY_A3:
+    return "upper right of keypad";
+  case KEY_B2:
+    return "center of keypad";
+  case KEY_C1:
+    return "lower left of keypad";
+  case KEY_C3:
+    return "lower right of keypad";
+  case KEY_BTAB:
+    return "back-tab key";
+  case KEY_BEG:
+    return "begin key";
+  case KEY_CANCEL:
+    return "cancel key";
+  case KEY_CLOSE:
+    return "close key";
+  case KEY_COMMAND:
+    return "command key";
+  case KEY_COPY:
+    return "copy key";
+  case KEY_CREATE:
+    return "create key";
+  case KEY_END:
+    return "end key";
+  case KEY_EXIT:
+    return "exit key";
+  case KEY_FIND:
+    return "find key";
+  case KEY_HELP:
+    return "help key";
+  case KEY_MARK:
+    return "mark key";
+  case KEY_MESSAGE:
+    return "message key";
+  case KEY_MOVE:
+    return "move key";
+  case KEY_NEXT:
+    return "next key";
+  case KEY_OPEN:
+    return "open key";
+  case KEY_OPTIONS:
+    return "options key";
+  case KEY_PREVIOUS:
+    return "previous key";
+  case KEY_REDO:
+    return "redo key";
+  case KEY_REFERENCE:
+    return "reference key";
+  case KEY_REFRESH:
+    return "refresh key";
+  case KEY_REPLACE:
+    return "replace key";
+  case KEY_RESTART:
+    return "restart key";
+  case KEY_RESUME:
+    return "resume key";
+  case KEY_SAVE:
+    return "save key";
+  case KEY_SBEG:
+    return "shifted begin key";
+  case KEY_SCANCEL:
+    return "shifted cancel key";
+  case KEY_SCOMMAND:
+    return "shifted command key";
+  case KEY_SCOPY:
+    return "shifted copy key";
+  case KEY_SCREATE:
+    return "shifted create key";
+  case KEY_SDC:
+    return "shifted delete-character key";
+  case KEY_SDL:
+    return "shifted delete-line key";
+  case KEY_SELECT:
+    return "select key";
+  case KEY_SEND:
+    return "shifted end key";
+  case KEY_SEOL:
+    return "shifted clear-to-end-of-line key";
+  case KEY_SEXIT:
+    return "shifted exit key";
+  case KEY_SFIND:
+    return "shifted find key";
+  case KEY_SHELP:
+    return "shifted help key";
+  case KEY_SHOME:
+    return "shifted home key";
+  case KEY_SIC:
+    return "shifted insert-character key";
+  case KEY_SLEFT:
+    return "shifted left-arrow key";
+  case KEY_SMESSAGE:
+    return "shifted message key";
+  case KEY_SMOVE:
+    return "shifted move key";
+  case KEY_SNEXT:
+    return "shifted next key";
+  case KEY_SOPTIONS:
+    return "shifted options key";
+  case KEY_SPREVIOUS:
+    return "shifted previous key";
+  case KEY_SPRINT:
+    return "shifted print key";
+  case KEY_SREDO:
+    return "shifted redo key";
+  case KEY_SREPLACE:
+    return "shifted replace key";
+  case KEY_SRIGHT:
+    return "shifted right-arrow key";
+  case KEY_SRSUME:
+    return "shifted resume key";
+  case KEY_SSAVE:
+    return "shifted save key";
+  case KEY_SSUSPEND:
+    return "shifted suspend key";
+  case KEY_SUNDO:
+    return "shifted undo key";
+  case KEY_SUSPEND:
+    return "suspend key";
+  case KEY_UNDO:
+    return "undo key";
+  case KEY_MOUSE:
+    return "Mouse event has occurred";
+  case KEY_RESIZE:
+    return "Terminal resize event";
+#ifdef KEY_EVENT
+  case KEY_EVENT:
+    return "We were interrupted by an event";
+#endif
+  case KEY_RETURN:
+    return "return";
+  case ' ':
+    return "space";
+  case '\t':
+    return "tab";
+  case KEY_ESCAPE:
+    return "escape";
+  default:
+    if (isprint(ch))
+      snprintf(g_desc, sizeof(g_desc), "%c", ch);
+    else
+      snprintf(g_desc, sizeof(g_desc), "\\x%2.2x", ch);
+    return g_desc;
+  }
+  return nullptr;
+}
m_text.AppendString(""); + } + if (key_help_array) { + for (KeyHelp *key = key_help_array; key->ch; ++key) { + StreamString key_description; + key_description.Printf("%10s - %s", CursesKeyToCString(key->ch), + key->description); + m_text.AppendString(key_description.GetString()); + } + } +} + +HelpDialogDelegate::~HelpDialogDelegate() = default; + +bool HelpDialogDelegate::WindowDelegateDraw(Window &window, bool force) { + window.Erase(); + const int window_height = window.GetHeight(); + int x = 2; + int y = 1; + const int min_y = y; + const int max_y = window_height - 1 - y; + const size_t num_visible_lines = max_y - min_y + 1; + const size_t num_lines = m_text.GetSize(); + const char *bottom_message; + if (num_lines <= num_visible_lines) + bottom_message = "Press any key to exit"; + else + bottom_message = "Use arrows to scroll, any other key to exit"; + window.DrawTitleBox(window.GetName(), bottom_message); + while (y <= max_y) { + window.MoveCursor(x, y); + window.PutCStringTruncated( + m_text.GetStringAtIndex(m_first_visible_line + y - min_y), 1); + ++y; + } + return true; +} + +HandleCharResult HelpDialogDelegate::WindowDelegateHandleChar(Window &window, + int key) { + bool done = false; + const size_t num_lines = m_text.GetSize(); + const size_t num_visible_lines = window.GetHeight() - 2; + + if (num_lines <= num_visible_lines) { + done = true; + // If we have all lines visible and don't need scrolling, then any key + // press will cause us to exit + } else { + switch (key) { + case KEY_UP: + if (m_first_visible_line > 0) + --m_first_visible_line; + break; + + case KEY_DOWN: + if (m_first_visible_line + num_visible_lines < num_lines) + ++m_first_visible_line; + break; + + case KEY_PPAGE: + case ',': + if (m_first_visible_line > 0) { + if (static_cast(m_first_visible_line) >= num_visible_lines) + m_first_visible_line -= num_visible_lines; + else + m_first_visible_line = 0; + } + break; + + case KEY_NPAGE: + case '.': + if (m_first_visible_line + num_visible_lines < num_lines) { + m_first_visible_line += num_visible_lines; + if (static_cast(m_first_visible_line) > num_lines) + m_first_visible_line = num_lines - num_visible_lines; + } + break; + + default: + done = true; + break; + } + } + if (done) + window.GetParent()->RemoveSubWindow(&window); + return eKeyHandled; +} + +class ApplicationDelegate : public WindowDelegate, public MenuDelegate { +public: + enum { + eMenuID_LLDB = 1, + eMenuID_LLDBAbout, + eMenuID_LLDBExit, + + eMenuID_Target, + eMenuID_TargetCreate, + eMenuID_TargetDelete, + + eMenuID_Process, + eMenuID_ProcessAttach, + eMenuID_ProcessDetach, + eMenuID_ProcessLaunch, + eMenuID_ProcessContinue, + eMenuID_ProcessHalt, + eMenuID_ProcessKill, + + eMenuID_Thread, + eMenuID_ThreadStepIn, + eMenuID_ThreadStepOver, + eMenuID_ThreadStepOut, + + eMenuID_View, + eMenuID_ViewBacktrace, + eMenuID_ViewRegisters, + eMenuID_ViewSource, + eMenuID_ViewVariables, + + eMenuID_Help, + eMenuID_HelpGUIHelp + }; + + ApplicationDelegate(Application &app, Debugger &debugger) + : WindowDelegate(), MenuDelegate(), m_app(app), m_debugger(debugger) {} + + ~ApplicationDelegate() override = default; + + bool WindowDelegateDraw(Window &window, bool force) override { + return false; // Drawing not handled, let standard window drawing happen + } + + HandleCharResult WindowDelegateHandleChar(Window &window, int key) override { + switch (key) { + case '\t': + window.SelectNextWindowAsActive(); + return eKeyHandled; + + case 'h': + window.CreateHelpSubwindow(); + return eKeyHandled; + + case KEY_ESCAPE: + 
return eQuitApplication; + + default: + break; + } + return eKeyNotHandled; + } + + const char *WindowDelegateGetHelpText() override { + return "Welcome to the LLDB curses GUI.\n\n" + "Press the TAB key to change the selected view.\n" + "Each view has its own keyboard shortcuts, press 'h' to open a " + "dialog to display them.\n\n" + "Common key bindings for all views:"; + } + + KeyHelp *WindowDelegateGetKeyHelp() override { + static curses::KeyHelp g_source_view_key_help[] = { + {'\t', "Select next view"}, + {'h', "Show help dialog with view specific key bindings"}, + {',', "Page up"}, + {'.', "Page down"}, + {KEY_UP, "Select previous"}, + {KEY_DOWN, "Select next"}, + {KEY_LEFT, "Unexpand or select parent"}, + {KEY_RIGHT, "Expand"}, + {KEY_PPAGE, "Page up"}, + {KEY_NPAGE, "Page down"}, + {'\0', nullptr}}; + return g_source_view_key_help; + } + + MenuActionResult MenuDelegateAction(Menu &menu) override { + switch (menu.GetIdentifier()) { + case eMenuID_ThreadStepIn: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive() && + StateIsStoppedState(process->GetState(), true)) + exe_ctx.GetThreadRef().StepIn(true); + } + } + return MenuActionResult::Handled; + + case eMenuID_ThreadStepOut: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive() && + StateIsStoppedState(process->GetState(), true)) + exe_ctx.GetThreadRef().StepOut(); + } + } + return MenuActionResult::Handled; + + case eMenuID_ThreadStepOver: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive() && + StateIsStoppedState(process->GetState(), true)) + exe_ctx.GetThreadRef().StepOver(true); + } + } + return MenuActionResult::Handled; + + case eMenuID_ProcessContinue: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive() && + StateIsStoppedState(process->GetState(), true)) + process->Resume(); + } + } + return MenuActionResult::Handled; + + case eMenuID_ProcessKill: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive()) + process->Destroy(false); + } + } + return MenuActionResult::Handled; + + case eMenuID_ProcessHalt: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive()) + process->Halt(); + } + } + return MenuActionResult::Handled; + + case eMenuID_ProcessDetach: { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) { + Process *process = exe_ctx.GetProcessPtr(); + if (process && process->IsAlive()) + process->Detach(false); + } + } + return MenuActionResult::Handled; + + case eMenuID_Process: { + // Populate the menu with all of the threads if the process is stopped + // when the Process menu gets selected and is about to display its + // submenu. 
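+      // The first seven submenus are the fixed Process menu items
+      // (presumably built along with the menubar, outside this listing);
+      // index 7 holds the separator and indexes 8 and up hold the dynamic
+      // per-thread items, which is why the code below erases from index 8
+      // before re-adding one item per thread.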
+      Menus &submenus = menu.GetSubmenus();
+      ExecutionContext exe_ctx =
+          m_debugger.GetCommandInterpreter().GetExecutionContext();
+      Process *process = exe_ctx.GetProcessPtr();
+      if (process && process->IsAlive() &&
+          StateIsStoppedState(process->GetState(), true)) {
+        if (submenus.size() == 7)
+          menu.AddSubmenu(MenuSP(new Menu(Menu::Type::Separator)));
+        else if (submenus.size() > 8)
+          submenus.erase(submenus.begin() + 8, submenus.end());
+
+        ThreadList &threads = process->GetThreadList();
+        std::lock_guard<std::recursive_mutex> guard(threads.GetMutex());
+        size_t num_threads = threads.GetSize();
+        for (size_t i = 0; i < num_threads; ++i) {
+          ThreadSP thread_sp = threads.GetThreadAtIndex(i);
+          char menu_char = '\0';
+          if (i < 9)
+            menu_char = '1' + i;
+          StreamString thread_menu_title;
+          thread_menu_title.Printf("Thread %u", thread_sp->GetIndexID());
+          const char *thread_name = thread_sp->GetName();
+          if (thread_name && thread_name[0])
+            thread_menu_title.Printf(" %s", thread_name);
+          else {
+            const char *queue_name = thread_sp->GetQueueName();
+            if (queue_name && queue_name[0])
+              thread_menu_title.Printf(" %s", queue_name);
+          }
+          menu.AddSubmenu(
+              MenuSP(new Menu(thread_menu_title.GetString().str().c_str(),
+                              nullptr, menu_char, thread_sp->GetID())));
+        }
+      } else if (submenus.size() > 7) {
+        // Remove the separator and any other thread submenu items that were
+        // previously added
+        submenus.erase(submenus.begin() + 7, submenus.end());
+      }
+      // Since we are adding and removing items we need to recalculate the
+      // name lengths
+      menu.RecalculateNameLengths();
+    }
+      return MenuActionResult::Handled;
+
+    case eMenuID_ViewVariables: {
+      WindowSP main_window_sp = m_app.GetMainWindow();
+      WindowSP source_window_sp = main_window_sp->FindSubWindow("Source");
+      WindowSP variables_window_sp = main_window_sp->FindSubWindow("Variables");
+      WindowSP registers_window_sp = main_window_sp->FindSubWindow("Registers");
+      const Rect source_bounds = source_window_sp->GetBounds();
+
+      if (variables_window_sp) {
+        const Rect variables_bounds = variables_window_sp->GetBounds();
+
+        main_window_sp->RemoveSubWindow(variables_window_sp.get());
+
+        if (registers_window_sp) {
+          // We have a registers window, so give all the area back to the
+          // registers window
+          Rect registers_bounds = variables_bounds;
+          registers_bounds.size.width = source_bounds.size.width;
+          registers_window_sp->SetBounds(registers_bounds);
+        } else {
+          // We have no registers window showing so give the bottom area
+          // back to the source view
+          source_window_sp->Resize(source_bounds.size.width,
+                                   source_bounds.size.height +
+                                       variables_bounds.size.height);
+        }
+      } else {
+        Rect new_variables_rect;
+        if (registers_window_sp) {
+          // We have a registers window so split the area of the registers
+          // window into two columns where the left hand side will be the
+          // variables and the right hand side will be the registers
+          const Rect variables_bounds = registers_window_sp->GetBounds();
+          Rect new_registers_rect;
+          variables_bounds.VerticalSplitPercentage(0.50, new_variables_rect,
+                                                   new_registers_rect);
+          registers_window_sp->SetBounds(new_registers_rect);
+        } else {
+          // No variables window, grab the bottom part of the source window
+          Rect new_source_rect;
+          source_bounds.HorizontalSplitPercentage(0.70, new_source_rect,
+                                                  new_variables_rect);
+          source_window_sp->SetBounds(new_source_rect);
+        }
+        WindowSP new_window_sp = main_window_sp->CreateSubWindow(
+            "Variables", new_variables_rect, false);
+        new_window_sp->SetDelegate(
+            WindowDelegateSP(new FrameVariablesWindowDelegate(m_debugger)));
+      }
+      touchwin(stdscr);
+    }
+      return MenuActionResult::Handled;
+
+    case eMenuID_ViewRegisters: {
+      WindowSP main_window_sp = m_app.GetMainWindow();
+      WindowSP source_window_sp = main_window_sp->FindSubWindow("Source");
+      WindowSP variables_window_sp = main_window_sp->FindSubWindow("Variables");
+      WindowSP registers_window_sp = main_window_sp->FindSubWindow("Registers");
+      const Rect source_bounds = source_window_sp->GetBounds();
+
+      if (registers_window_sp) {
+        if (variables_window_sp) {
+          const Rect variables_bounds = variables_window_sp->GetBounds();
+
+          // We have a variables window, so give all the area back to the
+          // variables window
+          variables_window_sp->Resize(variables_bounds.size.width +
+                                          registers_window_sp->GetWidth(),
+                                      variables_bounds.size.height);
+        } else {
+          // We have no variables window showing so give the bottom area
+          // back to the source view
+          source_window_sp->Resize(source_bounds.size.width,
+                                   source_bounds.size.height +
+                                       registers_window_sp->GetHeight());
+        }
+        main_window_sp->RemoveSubWindow(registers_window_sp.get());
+      } else {
+        Rect new_regs_rect;
+        if (variables_window_sp) {
+          // We have a variables window, split it into two columns where the
+          // left hand side will be the variables and the right hand side
+          // will be the registers
+          const Rect variables_bounds = variables_window_sp->GetBounds();
+          Rect new_vars_rect;
+          variables_bounds.VerticalSplitPercentage(0.50, new_vars_rect,
+                                                   new_regs_rect);
+          variables_window_sp->SetBounds(new_vars_rect);
+        } else {
+          // No registers window, grab the bottom part of the source window
+          Rect new_source_rect;
+          source_bounds.HorizontalSplitPercentage(0.70, new_source_rect,
+                                                  new_regs_rect);
+          source_window_sp->SetBounds(new_source_rect);
+        }
+        WindowSP new_window_sp =
+            main_window_sp->CreateSubWindow("Registers", new_regs_rect, false);
+        new_window_sp->SetDelegate(
+            WindowDelegateSP(new RegistersWindowDelegate(m_debugger)));
+      }
+      touchwin(stdscr);
+    }
+      return MenuActionResult::Handled;
+
+    case eMenuID_HelpGUIHelp:
+      m_app.GetMainWindow()->CreateHelpSubwindow();
+      return MenuActionResult::Handled;
+
+    default:
+      break;
+    }
+
+    return MenuActionResult::NotHandled;
+  }
+
+protected:
+  Application &m_app;
+  Debugger &m_debugger;
+};
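+
+// The status bar draws at fixed columns: process ID and state at column 0,
+// the formatted thread info at column 40, and the frame index/PC at column
+// 60 (the latter two only while the process is stopped).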
(state == eStateExited) { + const char *exit_desc = process->GetExitDescription(); + const int exit_status = process->GetExitStatus(); + if (exit_desc && exit_desc[0]) + window.Printf(" with status = %i (%s)", exit_status, exit_desc); + else + window.Printf(" with status = %i", exit_status); + } + } + return true; + } + +protected: + Debugger &m_debugger; + FormatEntity::Entry m_format; +}; + +class SourceFileWindowDelegate : public WindowDelegate { +public: + SourceFileWindowDelegate(Debugger &debugger) + : WindowDelegate(), m_debugger(debugger), m_sc(), m_file_sp(), + m_disassembly_scope(nullptr), m_disassembly_sp(), m_disassembly_range(), + m_title(), m_line_width(4), m_selected_line(0), m_pc_line(0), + m_stop_id(0), m_frame_idx(UINT32_MAX), m_first_visible_line(0), + m_min_x(0), m_min_y(0), m_max_x(0), m_max_y(0) {} + + ~SourceFileWindowDelegate() override = default; + + void Update(const SymbolContext &sc) { m_sc = sc; } + + uint32_t NumVisibleLines() const { return m_max_y - m_min_y; } + + const char *WindowDelegateGetHelpText() override { + return "Source/Disassembly window keyboard shortcuts:"; + } + + KeyHelp *WindowDelegateGetKeyHelp() override { + static curses::KeyHelp g_source_view_key_help[] = { + {KEY_RETURN, "Run to selected line with one shot breakpoint"}, + {KEY_UP, "Select previous source line"}, + {KEY_DOWN, "Select next source line"}, + {KEY_PPAGE, "Page up"}, + {KEY_NPAGE, "Page down"}, + {'b', "Set breakpoint on selected source/disassembly line"}, + {'c', "Continue process"}, + {'d', "Detach and resume process"}, + {'D', "Detach with process suspended"}, + {'h', "Show help dialog"}, + {'k', "Kill process"}, + {'n', "Step over (source line)"}, + {'N', "Step over (single instruction)"}, + {'o', "Step out"}, + {'s', "Step in (source line)"}, + {'S', "Step in (single instruction)"}, + {',', "Page up"}, + {'.', "Page down"}, + {'\0', nullptr}}; + return g_source_view_key_help; + } + + bool WindowDelegateDraw(Window &window, bool force) override { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + Process *process = exe_ctx.GetProcessPtr(); + Thread *thread = nullptr; + + bool update_location = false; + if (process) { + StateType state = process->GetState(); + if (StateIsStoppedState(state, true)) { + // We are stopped, so it is ok to update the location + update_location = true; + } + } + + m_min_x = 1; + m_min_y = 2; + m_max_x = window.GetMaxX() - 1; + m_max_y = window.GetMaxY() - 1; + + const uint32_t num_visible_lines = NumVisibleLines(); + StackFrameSP frame_sp; + bool set_selected_line_to_pc = false; + + if (update_location) { + const bool process_alive = process ? process->IsAlive() : false; + bool thread_changed = false; + if (process_alive) { + thread = exe_ctx.GetThreadPtr(); + if (thread) { + frame_sp = thread->GetSelectedFrame(); + auto tid = thread->GetID(); + thread_changed = tid != m_tid; + m_tid = tid; + } else { + if (m_tid != LLDB_INVALID_THREAD_ID) { + thread_changed = true; + m_tid = LLDB_INVALID_THREAD_ID; + } + } + } + const uint32_t stop_id = process ?
process->GetStopID() : 0; + const bool stop_id_changed = stop_id != m_stop_id; + bool frame_changed = false; + m_stop_id = stop_id; + m_title.Clear(); + if (frame_sp) { + m_sc = frame_sp->GetSymbolContext(eSymbolContextEverything); + if (m_sc.module_sp) { + m_title.Printf( + "%s", m_sc.module_sp->GetFileSpec().GetFilename().GetCString()); + ConstString func_name = m_sc.GetFunctionName(); + if (func_name) + m_title.Printf("`%s", func_name.GetCString()); + } + const uint32_t frame_idx = frame_sp->GetFrameIndex(); + frame_changed = frame_idx != m_frame_idx; + m_frame_idx = frame_idx; + } else { + m_sc.Clear(true); + frame_changed = m_frame_idx != UINT32_MAX; + m_frame_idx = UINT32_MAX; + } + + const bool context_changed = + thread_changed || frame_changed || stop_id_changed; + + if (process_alive) { + if (m_sc.line_entry.IsValid()) { + m_pc_line = m_sc.line_entry.line; + if (m_pc_line != UINT32_MAX) + --m_pc_line; // Convert to zero based line number... + // Update the selected line if the stop ID changed... + if (context_changed) + m_selected_line = m_pc_line; + + if (m_file_sp && m_file_sp->GetFileSpec() == m_sc.line_entry.file) { + // Same file, nothing to do, we should either have the lines or not + // (source file missing) + if (m_selected_line >= static_cast<size_t>(m_first_visible_line)) { + if (m_selected_line >= m_first_visible_line + num_visible_lines) + m_first_visible_line = m_selected_line - 10; + } else { + if (m_selected_line > 10) + m_first_visible_line = m_selected_line - 10; + else + m_first_visible_line = 0; + } + } else { + // File changed, set selected line to the line with the PC + m_selected_line = m_pc_line; + m_file_sp = + m_debugger.GetSourceManager().GetFile(m_sc.line_entry.file); + if (m_file_sp) { + const size_t num_lines = m_file_sp->GetNumLines(); + m_line_width = 1; + for (size_t n = num_lines; n >= 10; n = n / 10) + ++m_line_width; + + if (num_lines < num_visible_lines || + m_selected_line < num_visible_lines) + m_first_visible_line = 0; + else + m_first_visible_line = m_selected_line - 10; + } + } + } else { + m_file_sp.reset(); + } + + if (!m_file_sp || m_file_sp->GetNumLines() == 0) { + // Show disassembly + bool prefer_file_cache = false; + if (m_sc.function) { + if (m_disassembly_scope != m_sc.function) { + m_disassembly_scope = m_sc.function; + m_disassembly_sp = m_sc.function->GetInstructions( + exe_ctx, nullptr, prefer_file_cache); + if (m_disassembly_sp) { + set_selected_line_to_pc = true; + m_disassembly_range = m_sc.function->GetAddressRange(); + } else { + m_disassembly_range.Clear(); + } + } else { + set_selected_line_to_pc = context_changed; + } + } else if (m_sc.symbol) { + if (m_disassembly_scope != m_sc.symbol) { + m_disassembly_scope = m_sc.symbol; + m_disassembly_sp = m_sc.symbol->GetInstructions( + exe_ctx, nullptr, prefer_file_cache); + if (m_disassembly_sp) { + set_selected_line_to_pc = true; + m_disassembly_range.GetBaseAddress() = + m_sc.symbol->GetAddress(); + m_disassembly_range.SetByteSize(m_sc.symbol->GetByteSize()); + } else { + m_disassembly_range.Clear(); + } + } else { + set_selected_line_to_pc = context_changed; + } + } + } + } else { + m_pc_line = UINT32_MAX; + } + } + + const int window_width = window.GetWidth(); + window.Erase(); + window.DrawTitleBox("Sources"); + if (!m_title.GetString().empty()) { + window.AttributeOn(A_REVERSE); + window.MoveCursor(1, 1); + window.PutChar(' '); + window.PutCStringTruncated(m_title.GetString().str().c_str(), 1); + int x = window.GetCursorX(); + if (x < window_width - 1) { + window.Printf("%*s",
window_width - x - 1, ""); + } + window.AttributeOff(A_REVERSE); + } + + Target *target = exe_ctx.GetTargetPtr(); + const size_t num_source_lines = GetNumSourceLines(); + if (num_source_lines > 0) { + // Display source + BreakpointLines bp_lines; + if (target) { + BreakpointList &bp_list = target->GetBreakpointList(); + const size_t num_bps = bp_list.GetSize(); + for (size_t bp_idx = 0; bp_idx < num_bps; ++bp_idx) { + BreakpointSP bp_sp = bp_list.GetBreakpointAtIndex(bp_idx); + const size_t num_bps_locs = bp_sp->GetNumLocations(); + for (size_t bp_loc_idx = 0; bp_loc_idx < num_bps_locs; ++bp_loc_idx) { + BreakpointLocationSP bp_loc_sp = + bp_sp->GetLocationAtIndex(bp_loc_idx); + LineEntry bp_loc_line_entry; + if (bp_loc_sp->GetAddress().CalculateSymbolContextLineEntry( + bp_loc_line_entry)) { + if (m_file_sp->GetFileSpec() == bp_loc_line_entry.file) { + bp_lines.insert(bp_loc_line_entry.line); + } + } + } + } + } + + const attr_t selected_highlight_attr = A_REVERSE; + const attr_t pc_highlight_attr = COLOR_PAIR(1); + + for (size_t i = 0; i < num_visible_lines; ++i) { + const uint32_t curr_line = m_first_visible_line + i; + if (curr_line < num_source_lines) { + const int line_y = m_min_y + i; + window.MoveCursor(1, line_y); + const bool is_pc_line = curr_line == m_pc_line; + const bool line_is_selected = m_selected_line == curr_line; + // Highlight the line as the PC line first, then if the selected line + // isn't the same as the PC line, highlight it differently + attr_t highlight_attr = 0; + attr_t bp_attr = 0; + if (is_pc_line) + highlight_attr = pc_highlight_attr; + else if (line_is_selected) + highlight_attr = selected_highlight_attr; + + if (bp_lines.find(curr_line + 1) != bp_lines.end()) + bp_attr = COLOR_PAIR(2); + + if (bp_attr) + window.AttributeOn(bp_attr); + + window.Printf(" %*u ", m_line_width, curr_line + 1); + + if (bp_attr) + window.AttributeOff(bp_attr); + + window.PutChar(ACS_VLINE); + // Mark the line with the PC with a diamond + if (is_pc_line) + window.PutChar(ACS_DIAMOND); + else + window.PutChar(' '); + + if (highlight_attr) + window.AttributeOn(highlight_attr); + const uint32_t line_len = + m_file_sp->GetLineLength(curr_line + 1, false); + if (line_len > 0) + window.PutCString(m_file_sp->PeekLineData(curr_line + 1), line_len); + + if (is_pc_line && frame_sp && + frame_sp->GetConcreteFrameIndex() == 0) { + StopInfoSP stop_info_sp; + if (thread) + stop_info_sp = thread->GetStopInfo(); + if (stop_info_sp) { + const char *stop_description = stop_info_sp->GetDescription(); + if (stop_description && stop_description[0]) { + size_t stop_description_len = strlen(stop_description); + int desc_x = window_width - stop_description_len - 16; + window.Printf("%*s", desc_x - window.GetCursorX(), ""); + // window.MoveCursor(window_width - stop_description_len - 15, + // line_y); + window.Printf("<<< Thread %u: %s ", thread->GetIndexID(), + stop_description); + } + } else { + window.Printf("%*s", window_width - window.GetCursorX() - 1, ""); + } + } + if (highlight_attr) + window.AttributeOff(highlight_attr); + } else { + break; + } + } + } else { + size_t num_disassembly_lines = GetNumDisassemblyLines(); + if (num_disassembly_lines > 0) { + // Display disassembly + BreakpointAddrs bp_file_addrs; + Target *target = exe_ctx.GetTargetPtr(); + if (target) { + BreakpointList &bp_list = target->GetBreakpointList(); + const size_t num_bps = bp_list.GetSize(); + for (size_t bp_idx = 0; bp_idx < num_bps; ++bp_idx) { + BreakpointSP bp_sp = bp_list.GetBreakpointAtIndex(bp_idx); + const size_t 
num_bps_locs = bp_sp->GetNumLocations(); + for (size_t bp_loc_idx = 0; bp_loc_idx < num_bps_locs; + ++bp_loc_idx) { + BreakpointLocationSP bp_loc_sp = + bp_sp->GetLocationAtIndex(bp_loc_idx); + LineEntry bp_loc_line_entry; + const lldb::addr_t file_addr = + bp_loc_sp->GetAddress().GetFileAddress(); + if (file_addr != LLDB_INVALID_ADDRESS) { + if (m_disassembly_range.ContainsFileAddress(file_addr)) + bp_file_addrs.insert(file_addr); + } + } + } + } + + const attr_t selected_highlight_attr = A_REVERSE; + const attr_t pc_highlight_attr = COLOR_PAIR(1); + + StreamString strm; + + InstructionList &insts = m_disassembly_sp->GetInstructionList(); + Address pc_address; + + if (frame_sp) + pc_address = frame_sp->GetFrameCodeAddress(); + const uint32_t pc_idx = + pc_address.IsValid() + ? insts.GetIndexOfInstructionAtAddress(pc_address) + : UINT32_MAX; + if (set_selected_line_to_pc) { + m_selected_line = pc_idx; + } + + const uint32_t non_visible_pc_offset = (num_visible_lines / 5); + if (static_cast<size_t>(m_first_visible_line) >= num_disassembly_lines) + m_first_visible_line = 0; + + if (pc_idx < num_disassembly_lines) { + if (pc_idx < static_cast<uint32_t>(m_first_visible_line) || + pc_idx >= m_first_visible_line + num_visible_lines) + m_first_visible_line = pc_idx - non_visible_pc_offset; + } + + for (size_t i = 0; i < num_visible_lines; ++i) { + const uint32_t inst_idx = m_first_visible_line + i; + Instruction *inst = insts.GetInstructionAtIndex(inst_idx).get(); + if (!inst) + break; + + const int line_y = m_min_y + i; + window.MoveCursor(1, line_y); + const bool is_pc_line = frame_sp && inst_idx == pc_idx; + const bool line_is_selected = m_selected_line == inst_idx; + // Highlight the line as the PC line first, then if the selected line + // isn't the same as the PC line, highlight it differently + attr_t highlight_attr = 0; + attr_t bp_attr = 0; + if (is_pc_line) + highlight_attr = pc_highlight_attr; + else if (line_is_selected) + highlight_attr = selected_highlight_attr; + + if (bp_file_addrs.find(inst->GetAddress().GetFileAddress()) != + bp_file_addrs.end()) + bp_attr = COLOR_PAIR(2); + + if (bp_attr) + window.AttributeOn(bp_attr); + + window.Printf(" 0x%16.16llx ", + static_cast<unsigned long long>( + inst->GetAddress().GetLoadAddress(target))); + + if (bp_attr) + window.AttributeOff(bp_attr); + + window.PutChar(ACS_VLINE); + // Mark the line with the PC with a diamond + if (is_pc_line) + window.PutChar(ACS_DIAMOND); + else + window.PutChar(' '); + + if (highlight_attr) + window.AttributeOn(highlight_attr); + + const char *mnemonic = inst->GetMnemonic(&exe_ctx); + const char *operands = inst->GetOperands(&exe_ctx); + const char *comment = inst->GetComment(&exe_ctx); + + if (mnemonic != nullptr && mnemonic[0] == '\0') + mnemonic = nullptr; + if (operands != nullptr && operands[0] == '\0') + operands = nullptr; + if (comment != nullptr && comment[0] == '\0') + comment = nullptr; + + strm.Clear(); + + if (mnemonic != nullptr && operands != nullptr && comment != nullptr) + strm.Printf("%-8s %-25s ; %s", mnemonic, operands, comment); + else if (mnemonic != nullptr && operands != nullptr) + strm.Printf("%-8s %s", mnemonic, operands); + else if (mnemonic != nullptr) + strm.Printf("%s", mnemonic); + + int right_pad = 1; + window.PutCStringTruncated(strm.GetData(), right_pad); + + if (is_pc_line && frame_sp && + frame_sp->GetConcreteFrameIndex() == 0) { + StopInfoSP stop_info_sp; + if (thread) + stop_info_sp = thread->GetStopInfo(); + if (stop_info_sp) { + const char *stop_description = stop_info_sp->GetDescription(); + if
(stop_description && stop_description[0]) { + size_t stop_description_len = strlen(stop_description); + int desc_x = window_width - stop_description_len - 16; + window.Printf("%*s", desc_x - window.GetCursorX(), ""); + // window.MoveCursor(window_width - stop_description_len - 15, + // line_y); + window.Printf("<<< Thread %u: %s ", thread->GetIndexID(), + stop_description); + } + } else { + window.Printf("%*s", window_width - window.GetCursorX() - 1, ""); + } + } + if (highlight_attr) + window.AttributeOff(highlight_attr); + } + } + } + return true; // Drawing handled + } + + size_t GetNumLines() { + size_t num_lines = GetNumSourceLines(); + if (num_lines == 0) + num_lines = GetNumDisassemblyLines(); + return num_lines; + } + + size_t GetNumSourceLines() const { + if (m_file_sp) + return m_file_sp->GetNumLines(); + return 0; + } + + size_t GetNumDisassemblyLines() const { + if (m_disassembly_sp) + return m_disassembly_sp->GetInstructionList().GetSize(); + return 0; + } + + HandleCharResult WindowDelegateHandleChar(Window &window, int c) override { + const uint32_t num_visible_lines = NumVisibleLines(); + const size_t num_lines = GetNumLines(); + + switch (c) { + case ',': + case KEY_PPAGE: + // Page up key + if (static_cast<uint32_t>(m_first_visible_line) > num_visible_lines) + m_first_visible_line -= num_visible_lines; + else + m_first_visible_line = 0; + m_selected_line = m_first_visible_line; + return eKeyHandled; + + case '.': + case KEY_NPAGE: + // Page down key + { + if (m_first_visible_line + num_visible_lines < num_lines) + m_first_visible_line += num_visible_lines; + else if (num_lines < num_visible_lines) + m_first_visible_line = 0; + else + m_first_visible_line = num_lines - num_visible_lines; + m_selected_line = m_first_visible_line; + } + return eKeyHandled; + + case KEY_UP: + if (m_selected_line > 0) { + m_selected_line--; + if (static_cast<uint32_t>(m_first_visible_line) > m_selected_line) + m_first_visible_line = m_selected_line; + } + return eKeyHandled; + + case KEY_DOWN: + if (m_selected_line + 1 < num_lines) { + m_selected_line++; + if (m_first_visible_line + num_visible_lines < m_selected_line) + m_first_visible_line++; + } + return eKeyHandled; + + case '\r': + case '\n': + case KEY_ENTER: + // Set a breakpoint and run to the line using a one shot breakpoint + if (GetNumSourceLines() > 0) { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope() && exe_ctx.GetProcessRef().IsAlive()) { + BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( + nullptr, // Don't limit the breakpoint to certain modules + m_file_sp->GetFileSpec(), // Source file + m_selected_line + + 1, // Source line number (m_selected_line is zero based) + 0, // Unspecified column.
+ 0, // No offset + eLazyBoolCalculate, // Check inlines using global setting + eLazyBoolCalculate, // Skip prologue using global setting, + false, // internal + false, // request_hardware + eLazyBoolCalculate); // move_to_nearest_code + // Make breakpoint one shot + bp_sp->GetOptions()->SetOneShot(true); + exe_ctx.GetProcessRef().Resume(); + } + } else if (m_selected_line < GetNumDisassemblyLines()) { + const Instruction *inst = m_disassembly_sp->GetInstructionList() + .GetInstructionAtIndex(m_selected_line) + .get(); + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasTargetScope()) { + Address addr = inst->GetAddress(); + BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( + addr, // lldb_private::Address + false, // internal + false); // request_hardware + // Make breakpoint one shot + bp_sp->GetOptions()->SetOneShot(true); + exe_ctx.GetProcessRef().Resume(); + } + } + return eKeyHandled; + + case 'b': // 'b' == toggle breakpoint on currently selected line + if (m_selected_line < GetNumSourceLines()) { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasTargetScope()) { + BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( + nullptr, // Don't limit the breakpoint to certain modules + m_file_sp->GetFileSpec(), // Source file + m_selected_line + + 1, // Source line number (m_selected_line is zero based) + 0, // No column specified. + 0, // No offset + eLazyBoolCalculate, // Check inlines using global setting + eLazyBoolCalculate, // Skip prologue using global setting, + false, // internal + false, // request_hardware + eLazyBoolCalculate); // move_to_nearest_code + } + } else if (m_selected_line < GetNumDisassemblyLines()) { + const Instruction *inst = m_disassembly_sp->GetInstructionList() + .GetInstructionAtIndex(m_selected_line) + .get(); + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasTargetScope()) { + Address addr = inst->GetAddress(); + BreakpointSP bp_sp = exe_ctx.GetTargetRef().CreateBreakpoint( + addr, // lldb_private::Address + false, // internal + false); // request_hardware + } + } + return eKeyHandled; + + case 'd': // 'd' == detach and let run + case 'D': // 'D' == detach and keep stopped + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) + exe_ctx.GetProcessRef().Detach(c == 'D'); + } + return eKeyHandled; + + case 'k': + // 'k' == kill + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) + exe_ctx.GetProcessRef().Destroy(false); + } + return eKeyHandled; + + case 'c': + // 'c' == continue + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasProcessScope()) + exe_ctx.GetProcessRef().Resume(); + } + return eKeyHandled; + + case 'o': + // 'o' == step out + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope() && + StateIsStoppedState(exe_ctx.GetProcessRef().GetState(), true)) { + exe_ctx.GetThreadRef().StepOut(); + } + } + return eKeyHandled; + + case 'n': // 'n' == step over + case 'N': // 'N' == step over instruction + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope() && + StateIsStoppedState(exe_ctx.GetProcessRef().GetState(), true)) { + bool 
source_step = (c == 'n'); + exe_ctx.GetThreadRef().StepOver(source_step); + } + } + return eKeyHandled; + + case 's': // 's' == step into + case 'S': // 'S' == step into instruction + { + ExecutionContext exe_ctx = + m_debugger.GetCommandInterpreter().GetExecutionContext(); + if (exe_ctx.HasThreadScope() && + StateIsStoppedState(exe_ctx.GetProcessRef().GetState(), true)) { + bool source_step = (c == 's'); + exe_ctx.GetThreadRef().StepIn(source_step); + } + } + return eKeyHandled; + + case 'h': + window.CreateHelpSubwindow(); + return eKeyHandled; + + default: + break; + } + return eKeyNotHandled; + } + +protected: + typedef std::set<uint32_t> BreakpointLines; + typedef std::set<lldb::addr_t> BreakpointAddrs; + + Debugger &m_debugger; + SymbolContext m_sc; + SourceManager::FileSP m_file_sp; + SymbolContextScope *m_disassembly_scope; + lldb::DisassemblerSP m_disassembly_sp; + AddressRange m_disassembly_range; + StreamString m_title; + lldb::user_id_t m_tid; + int m_line_width; + uint32_t m_selected_line; // The selected line + uint32_t m_pc_line; // The line with the PC + uint32_t m_stop_id; + uint32_t m_frame_idx; + int m_first_visible_line; + int m_min_x; + int m_min_y; + int m_max_x; + int m_max_y; +}; + +DisplayOptions ValueObjectListDelegate::g_options = {true}; + +IOHandlerCursesGUI::IOHandlerCursesGUI(Debugger &debugger) + : IOHandler(debugger, IOHandler::Type::Curses) {} + +void IOHandlerCursesGUI::Activate() { + IOHandler::Activate(); + if (!m_app_ap) { + m_app_ap.reset(new Application(GetInputFILE(), GetOutputFILE())); + + // This is both a window and a menu delegate + std::shared_ptr<ApplicationDelegate> app_delegate_sp( + new ApplicationDelegate(*m_app_ap, m_debugger)); + + MenuDelegateSP app_menu_delegate_sp = + std::static_pointer_cast<MenuDelegate>(app_delegate_sp); + MenuSP lldb_menu_sp( + new Menu("LLDB", "F1", KEY_F(1), ApplicationDelegate::eMenuID_LLDB)); + MenuSP exit_menuitem_sp( + new Menu("Exit", nullptr, 'x', ApplicationDelegate::eMenuID_LLDBExit)); + exit_menuitem_sp->SetCannedResult(MenuActionResult::Quit); + lldb_menu_sp->AddSubmenu(MenuSP(new Menu( + "About LLDB", nullptr, 'a', ApplicationDelegate::eMenuID_LLDBAbout))); + lldb_menu_sp->AddSubmenu(MenuSP(new Menu(Menu::Type::Separator))); + lldb_menu_sp->AddSubmenu(exit_menuitem_sp); + + MenuSP target_menu_sp(new Menu("Target", "F2", KEY_F(2), + ApplicationDelegate::eMenuID_Target)); + target_menu_sp->AddSubmenu(MenuSP(new Menu( + "Create", nullptr, 'c', ApplicationDelegate::eMenuID_TargetCreate))); + target_menu_sp->AddSubmenu(MenuSP(new Menu( + "Delete", nullptr, 'd', ApplicationDelegate::eMenuID_TargetDelete))); + + MenuSP process_menu_sp(new Menu("Process", "F3", KEY_F(3), + ApplicationDelegate::eMenuID_Process)); + process_menu_sp->AddSubmenu(MenuSP(new Menu( + "Attach", nullptr, 'a', ApplicationDelegate::eMenuID_ProcessAttach))); + process_menu_sp->AddSubmenu(MenuSP(new Menu( + "Detach", nullptr, 'd', ApplicationDelegate::eMenuID_ProcessDetach))); + process_menu_sp->AddSubmenu(MenuSP(new Menu( + "Launch", nullptr, 'l', ApplicationDelegate::eMenuID_ProcessLaunch))); + process_menu_sp->AddSubmenu(MenuSP(new Menu(Menu::Type::Separator))); + process_menu_sp->AddSubmenu( + MenuSP(new Menu("Continue", nullptr, 'c', + ApplicationDelegate::eMenuID_ProcessContinue))); + process_menu_sp->AddSubmenu(MenuSP(new Menu( + "Halt", nullptr, 'h', ApplicationDelegate::eMenuID_ProcessHalt))); + process_menu_sp->AddSubmenu(MenuSP(new Menu( + "Kill", nullptr, 'k', ApplicationDelegate::eMenuID_ProcessKill))); + + MenuSP thread_menu_sp(new Menu("Thread", "F4", KEY_F(4), +
ApplicationDelegate::eMenuID_Thread)); + thread_menu_sp->AddSubmenu(MenuSP(new Menu( + "Step In", nullptr, 'i', ApplicationDelegate::eMenuID_ThreadStepIn))); + thread_menu_sp->AddSubmenu( + MenuSP(new Menu("Step Over", nullptr, 'v', + ApplicationDelegate::eMenuID_ThreadStepOver))); + thread_menu_sp->AddSubmenu(MenuSP(new Menu( + "Step Out", nullptr, 'o', ApplicationDelegate::eMenuID_ThreadStepOut))); + + MenuSP view_menu_sp( + new Menu("View", "F5", KEY_F(5), ApplicationDelegate::eMenuID_View)); + view_menu_sp->AddSubmenu( + MenuSP(new Menu("Backtrace", nullptr, 'b', + ApplicationDelegate::eMenuID_ViewBacktrace))); + view_menu_sp->AddSubmenu( + MenuSP(new Menu("Registers", nullptr, 'r', + ApplicationDelegate::eMenuID_ViewRegisters))); + view_menu_sp->AddSubmenu(MenuSP(new Menu( + "Source", nullptr, 's', ApplicationDelegate::eMenuID_ViewSource))); + view_menu_sp->AddSubmenu( + MenuSP(new Menu("Variables", nullptr, 'v', + ApplicationDelegate::eMenuID_ViewVariables))); + + MenuSP help_menu_sp( + new Menu("Help", "F6", KEY_F(6), ApplicationDelegate::eMenuID_Help)); + help_menu_sp->AddSubmenu(MenuSP(new Menu( + "GUI Help", nullptr, 'g', ApplicationDelegate::eMenuID_HelpGUIHelp))); + + m_app_ap->Initialize(); + WindowSP &main_window_sp = m_app_ap->GetMainWindow(); + + MenuSP menubar_sp(new Menu(Menu::Type::Bar)); + menubar_sp->AddSubmenu(lldb_menu_sp); + menubar_sp->AddSubmenu(target_menu_sp); + menubar_sp->AddSubmenu(process_menu_sp); + menubar_sp->AddSubmenu(thread_menu_sp); + menubar_sp->AddSubmenu(view_menu_sp); + menubar_sp->AddSubmenu(help_menu_sp); + menubar_sp->SetDelegate(app_menu_delegate_sp); + + Rect content_bounds = main_window_sp->GetFrame(); + Rect menubar_bounds = content_bounds.MakeMenuBar(); + Rect status_bounds = content_bounds.MakeStatusBar(); + Rect source_bounds; + Rect variables_bounds; + Rect threads_bounds; + Rect source_variables_bounds; + content_bounds.VerticalSplitPercentage(0.80, source_variables_bounds, + threads_bounds); + source_variables_bounds.HorizontalSplitPercentage(0.70, source_bounds, + variables_bounds); + + WindowSP menubar_window_sp = + main_window_sp->CreateSubWindow("Menubar", menubar_bounds, false); + // Let the menubar get keys if the active window doesn't handle the keys + // that are typed so it can respond to menubar key presses. 
+ menubar_window_sp->SetCanBeActive( + false); // Don't let the menubar become the active window + menubar_window_sp->SetDelegate(menubar_sp); + + WindowSP source_window_sp( + main_window_sp->CreateSubWindow("Source", source_bounds, true)); + WindowSP variables_window_sp( + main_window_sp->CreateSubWindow("Variables", variables_bounds, false)); + WindowSP threads_window_sp( + main_window_sp->CreateSubWindow("Threads", threads_bounds, false)); + WindowSP status_window_sp( + main_window_sp->CreateSubWindow("Status", status_bounds, false)); + status_window_sp->SetCanBeActive( + false); // Don't let the status bar become the active window + main_window_sp->SetDelegate( + std::static_pointer_cast<WindowDelegate>(app_delegate_sp)); + source_window_sp->SetDelegate( + WindowDelegateSP(new SourceFileWindowDelegate(m_debugger))); + variables_window_sp->SetDelegate( + WindowDelegateSP(new FrameVariablesWindowDelegate(m_debugger))); + TreeDelegateSP thread_delegate_sp(new ThreadsTreeDelegate(m_debugger)); + threads_window_sp->SetDelegate(WindowDelegateSP( + new TreeWindowDelegate(m_debugger, thread_delegate_sp))); + status_window_sp->SetDelegate( + WindowDelegateSP(new StatusBarWindowDelegate(m_debugger))); + + // Show the main help window once the first time the curses GUI is launched + static bool g_showed_help = false; + if (!g_showed_help) { + g_showed_help = true; + main_window_sp->CreateHelpSubwindow(); + } + + init_pair(1, COLOR_WHITE, COLOR_BLUE); + init_pair(2, COLOR_BLACK, COLOR_WHITE); + init_pair(3, COLOR_MAGENTA, COLOR_WHITE); + init_pair(4, COLOR_MAGENTA, COLOR_BLACK); + init_pair(5, COLOR_RED, COLOR_BLACK); + } +} + +void IOHandlerCursesGUI::Deactivate() { m_app_ap->Terminate(); } + +void IOHandlerCursesGUI::Run() { + m_app_ap->Run(m_debugger); + SetIsDone(true); +} + +IOHandlerCursesGUI::~IOHandlerCursesGUI() = default; + +void IOHandlerCursesGUI::Cancel() {} + +bool IOHandlerCursesGUI::Interrupt() { return false; } + +void IOHandlerCursesGUI::GotEOF() {} + +#endif // LLDB_DISABLE_CURSES diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp index a14bd3d370a1b..cc4eea674170b 100644 --- a/lldb/source/Core/Module.cpp +++ b/lldb/source/Core/Module.cpp @@ -613,11 +613,10 @@ void Module::FindCompileUnits(const FileSpec &path, const size_t num_compile_units = GetNumCompileUnits(); SymbolContext sc; sc.module_sp = shared_from_this(); - const bool compare_directory = (bool)path.GetDirectory(); for (size_t i = 0; i < num_compile_units; ++i) { sc.comp_unit = GetCompileUnitAtIndex(i).get(); if (sc.comp_unit) { - if (FileSpec::Equal(*sc.comp_unit, path, compare_directory)) + if (FileSpec::Match(path, sc.comp_unit->GetPrimaryFile())) sc_list.Append(sc); } } @@ -1060,34 +1059,35 @@ std::string Module::GetSpecificationDescription() const { return spec; } -void Module::GetDescription(Stream *s, lldb::DescriptionLevel level) { +void Module::GetDescription(llvm::raw_ostream &s, + lldb::DescriptionLevel level) { std::lock_guard<std::recursive_mutex> guard(m_mutex); if (level >= eDescriptionLevelFull) { if (m_arch.IsValid()) - s->Printf("(%s) ", m_arch.GetArchitectureName()); + s << llvm::formatv("({0}) ", m_arch.GetArchitectureName()); } if (level == eDescriptionLevelBrief) { const char *filename = m_file.GetFilename().GetCString(); if (filename) - s->PutCString(filename); + s << filename; } else { char path[PATH_MAX]; if (m_file.GetPath(path, sizeof(path))) - s->PutCString(path); + s << path; } const char *object_name = m_object_name.GetCString(); if (object_name) - s->Printf("(%s)", object_name); + s <<
llvm::formatv("({0})", object_name); } void Module::ReportError(const char *format, ...) { if (format && format[0]) { StreamString strm; strm.PutCString("error: "); - GetDescription(&strm, lldb::eDescriptionLevelBrief); + GetDescription(strm.AsRawOstream(), lldb::eDescriptionLevelBrief); strm.PutChar(' '); va_list args; va_start(args, format); @@ -1118,7 +1118,7 @@ void Module::ReportErrorIfModifyDetected(const char *format, ...) { if (format) { StreamString strm; strm.PutCString("error: the object file "); - GetDescription(&strm, lldb::eDescriptionLevelFull); + GetDescription(strm.AsRawOstream(), lldb::eDescriptionLevelFull); strm.PutCString(" has been modified\n"); va_list args; @@ -1144,7 +1144,7 @@ void Module::ReportWarning(const char *format, ...) { if (format && format[0]) { StreamString strm; strm.PutCString("warning: "); - GetDescription(&strm, lldb::eDescriptionLevelFull); + GetDescription(strm.AsRawOstream(), lldb::eDescriptionLevelFull); strm.PutChar(' '); va_list args; @@ -1165,7 +1165,7 @@ void Module::ReportWarning(const char *format, ...) { void Module::LogMessage(Log *log, const char *format, ...) { if (log != nullptr) { StreamString log_message; - GetDescription(&log_message, lldb::eDescriptionLevelFull); + GetDescription(log_message.AsRawOstream(), lldb::eDescriptionLevelFull); log_message.PutCString(": "); va_list args; va_start(args, format); @@ -1178,7 +1178,7 @@ void Module::LogMessage(Log *log, const char *format, ...) { void Module::LogMessageVerboseBacktrace(Log *log, const char *format, ...) { if (log != nullptr) { StreamString log_message; - GetDescription(&log_message, lldb::eDescriptionLevelFull); + GetDescription(log_message.AsRawOstream(), lldb::eDescriptionLevelFull); log_message.PutCString(": "); va_list args; va_start(args, format); @@ -1559,19 +1559,13 @@ bool Module::MatchesModuleSpec(const ModuleSpec &module_ref) { } const FileSpec &file_spec = module_ref.GetFileSpec(); - if (file_spec) { - if (!FileSpec::Equal(file_spec, m_file, (bool)file_spec.GetDirectory()) && - !FileSpec::Equal(file_spec, m_platform_file, - (bool)file_spec.GetDirectory())) - return false; - } + if (!FileSpec::Match(file_spec, m_file) && + !FileSpec::Match(file_spec, m_platform_file)) + return false; const FileSpec &platform_file_spec = module_ref.GetPlatformFileSpec(); - if (platform_file_spec) { - if (!FileSpec::Equal(platform_file_spec, GetPlatformFileSpec(), - (bool)platform_file_spec.GetDirectory())) - return false; - } + if (!FileSpec::Match(platform_file_spec, GetPlatformFileSpec())) + return false; const ArchSpec &arch = module_ref.GetArchitecture(); if (arch.IsValid()) { diff --git a/lldb/source/Core/SearchFilter.cpp b/lldb/source/Core/SearchFilter.cpp index 8f80caa3eb4de..077aa89674253 100644 --- a/lldb/source/Core/SearchFilter.cpp +++ b/lldb/source/Core/SearchFilter.cpp @@ -403,13 +403,11 @@ SearchFilterByModule::~SearchFilterByModule() = default; bool SearchFilterByModule::ModulePasses(const ModuleSP &module_sp) { return (module_sp && - FileSpec::Equal(module_sp->GetFileSpec(), m_module_spec, false)); + FileSpec::Match(m_module_spec, module_sp->GetFileSpec())); } bool SearchFilterByModule::ModulePasses(const FileSpec &spec) { - // Do a full match only if "spec" has a directory - const bool full_match = (bool)spec.GetDirectory(); - return FileSpec::Equal(spec, m_module_spec, full_match); + return FileSpec::Match(m_module_spec, spec); } bool SearchFilterByModule::AddressPasses(Address &address) { @@ -443,8 +441,7 @@ void SearchFilterByModule::Search(Searcher &searcher) { 
const size_t num_modules = target_modules.GetSize(); for (size_t i = 0; i < num_modules; i++) { Module *module = target_modules.GetModulePointerAtIndexUnlocked(i); - const bool full_match = (bool)m_module_spec.GetDirectory(); - if (FileSpec::Equal(m_module_spec, module->GetFileSpec(), full_match)) { + if (FileSpec::Match(m_module_spec, module->GetFileSpec())) { SymbolContext matchingContext(m_target_sp, module->shared_from_this()); Searcher::CallbackReturn shouldContinue; @@ -726,8 +723,11 @@ bool SearchFilterByModuleListAndCU::AddressPasses(Address &address) { if (m_cu_spec_list.GetSize() != 0) return false; // Has no comp_unit so can't pass the file check. } - if (m_cu_spec_list.FindFileIndex(0, sym_ctx.comp_unit, false) == UINT32_MAX) - return false; // Fails the file check + FileSpec cu_spec; + if (sym_ctx.comp_unit) + cu_spec = sym_ctx.comp_unit->GetPrimaryFile(); + if (m_cu_spec_list.FindFileIndex(0, cu_spec, false) == UINT32_MAX) + return false; // Fails the file check return SearchFilterByModuleList::ModulePasses(sym_ctx.module_sp); } @@ -736,8 +736,8 @@ bool SearchFilterByModuleListAndCU::CompUnitPasses(FileSpec &fileSpec) { } bool SearchFilterByModuleListAndCU::CompUnitPasses(CompileUnit &compUnit) { - bool in_cu_list = - m_cu_spec_list.FindFileIndex(0, compUnit, false) != UINT32_MAX; + bool in_cu_list = m_cu_spec_list.FindFileIndex(0, compUnit.GetPrimaryFile(), + false) != UINT32_MAX; if (in_cu_list) { ModuleSP module_sp(compUnit.GetModule()); if (module_sp) { @@ -787,8 +787,9 @@ void SearchFilterByModuleListAndCU::Search(Searcher &searcher) { CompUnitSP cu_sp = module_sp->GetCompileUnitAtIndex(cu_idx); matchingContext.comp_unit = cu_sp.get(); if (matchingContext.comp_unit) { - if (m_cu_spec_list.FindFileIndex(0, *matchingContext.comp_unit, - false) != UINT32_MAX) { + if (m_cu_spec_list.FindFileIndex( + 0, matchingContext.comp_unit->GetPrimaryFile(), false) != + UINT32_MAX) { shouldContinue = DoCUIteration(module_sp, matchingContext, searcher); if (shouldContinue == Searcher::eCallbackReturnStop) diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 7615dc1d65c7f..e8fcca4603dfb 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -94,6 +94,8 @@ const char *Section::GetTypeAsCString() const { return "dwarf-ranges"; case eSectionTypeDWARFDebugRngLists: return "dwarf-rnglists"; + case eSectionTypeDWARFDebugRngListsDwo: + return "dwarf-rnglists-dwo"; case eSectionTypeDWARFDebugStr: return "dwarf-str"; case eSectionTypeDWARFDebugStrDwo: diff --git a/lldb/source/Core/SourceManager.cpp b/lldb/source/Core/SourceManager.cpp index 42741e4ba4fe4..8e0cc57f80c11 100644 --- a/lldb/source/Core/SourceManager.cpp +++ b/lldb/source/Core/SourceManager.cpp @@ -64,7 +64,8 @@ SourceManager::~SourceManager() {} SourceManager::FileSP SourceManager::GetFile(const FileSpec &file_spec) { bool same_as_previous = - m_last_file_sp && m_last_file_sp->FileSpecMatches(file_spec); + m_last_file_sp && + FileSpec::Match(file_spec, m_last_file_sp->GetFileSpec()); DebuggerSP debugger_sp(m_debugger_wp.lock()); FileSP file_sp; @@ -399,24 +400,25 @@ void SourceManager::File::CommonInitializer(const FileSpec &file_spec, if (num_matches != 0) { if (num_matches > 1) { SymbolContext sc; - FileSpec *test_cu_spec = nullptr; + CompileUnit *test_cu = nullptr; for (unsigned i = 0; i < num_matches; i++) { sc_list.GetContextAtIndex(i, sc); if (sc.comp_unit) { - if (test_cu_spec) { - if (test_cu_spec != static_cast(sc.comp_unit)) + if (test_cu) { + if (test_cu != sc.comp_unit) 
got_multiple = true; break; } else - test_cu_spec = sc.comp_unit; + test_cu = sc.comp_unit; } } } if (!got_multiple) { SymbolContext sc; sc_list.GetContextAtIndex(0, sc); - m_file_spec = sc.comp_unit; + if (sc.comp_unit) + m_file_spec = sc.comp_unit->GetPrimaryFile(); m_mod_time = FileSystem::Instance().GetModificationTime(m_file_spec); } } @@ -601,10 +603,6 @@ void SourceManager::File::FindLinesMatchingRegex( } } -bool SourceManager::File::FileSpecMatches(const FileSpec &file_spec) { - return FileSpec::Equal(m_file_spec, file_spec, false); -} - bool lldb_private::operator==(const SourceManager::File &lhs, const SourceManager::File &rhs) { if (lhs.m_file_spec != rhs.m_file_spec) diff --git a/lldb/source/Core/ValueObjectSyntheticFilter.cpp b/lldb/source/Core/ValueObjectSyntheticFilter.cpp index a6bf35eac70a2..a30be1b083384 100644 --- a/lldb/source/Core/ValueObjectSyntheticFilter.cpp +++ b/lldb/source/Core/ValueObjectSyntheticFilter.cpp @@ -48,8 +48,9 @@ class DummySyntheticFrontEnd : public SyntheticChildrenFrontEnd { ValueObjectSynthetic::ValueObjectSynthetic(ValueObject &parent, lldb::SyntheticChildrenSP filter) : ValueObject(parent), m_synth_sp(filter), m_children_byindex(), - m_name_toindex(), m_synthetic_children_count(UINT32_MAX), - m_synthetic_children_cache(), m_parent_type_name(parent.GetTypeName()), + m_name_toindex(), m_synthetic_children_cache(), + m_synthetic_children_count(UINT32_MAX), + m_parent_type_name(parent.GetTypeName()), m_might_have_children(eLazyBoolCalculate), m_provides_value(eLazyBoolCalculate) { SetName(parent.GetName()); @@ -177,14 +178,20 @@ bool ValueObjectSynthetic::UpdateValue() { "filter said caches are stale - clearing", GetName().AsCString()); // filter said that cached values are stale - m_children_byindex.Clear(); - m_name_toindex.Clear(); + { + std::lock_guard<std::mutex> guard(m_child_mutex); + m_children_byindex.clear(); + m_name_toindex.clear(); + } // usually, an object's value can change but this does not alter its // children count for a synthetic VO that might indeed happen, so we need // to tell the upper echelons that they need to come back to us asking for // children m_children_count_valid = false; - m_synthetic_children_cache.Clear(); + { + std::lock_guard<std::mutex> guard(m_child_mutex); + m_synthetic_children_cache.clear(); + } + m_synthetic_children_count = UINT32_MAX; + m_might_have_children = eLazyBoolCalculate; } else { @@ -232,7 +239,16 @@ lldb::ValueObjectSP ValueObjectSynthetic::GetChildAtIndex(size_t idx, UpdateValueIfNeeded(); ValueObject *valobj; - if (!m_children_byindex.GetValueForKey(idx, valobj)) { + bool child_is_cached; + { + std::lock_guard<std::mutex> guard(m_child_mutex); + auto cached_child_it = m_children_byindex.find(idx); + child_is_cached = cached_child_it != m_children_byindex.end(); + if (child_is_cached) + valobj = cached_child_it->second; + } + + if (!child_is_cached) { + if (can_create && m_synth_filter_up != nullptr) { + LLDB_LOGF(log, "[ValueObjectSynthetic::GetChildAtIndex] name=%s, child at " @@ -254,9 +270,12 @@ lldb::ValueObjectSP ValueObjectSynthetic::GetChildAtIndex(size_t idx, if (!synth_guy) return synth_guy; - if (synth_guy->IsSyntheticChildrenGenerated()) - m_synthetic_children_cache.AppendObject(synth_guy); - m_children_byindex.SetValueForKey(idx, synth_guy.get()); + { + std::lock_guard<std::mutex> guard(m_child_mutex); + if (synth_guy->IsSyntheticChildrenGenerated()) + m_synthetic_children_cache.push_back(synth_guy); + m_children_byindex[idx] = synth_guy.get(); + } + synth_guy->SetPreferredDisplayLanguageIfNeeded( + GetPreferredDisplayLanguage()); + return
synth_guy; @@ -297,13 +316,21 @@ size_t ValueObjectSynthetic::GetIndexOfChildWithName(ConstString name) { UpdateValueIfNeeded(); uint32_t found_index = UINT32_MAX; - bool did_find = m_name_toindex.GetValueForKey(name.GetCString(), found_index); + bool did_find; + { + std::lock_guard<std::mutex> guard(m_child_mutex); + auto name_to_index = m_name_toindex.find(name.GetCString()); + did_find = name_to_index != m_name_toindex.end(); + if (did_find) + found_index = name_to_index->second; + } if (!did_find && m_synth_filter_up != nullptr) { uint32_t index = m_synth_filter_up->GetIndexOfChildWithName(name); if (index == UINT32_MAX) return index; - m_name_toindex.SetValueForKey(name.GetCString(), index); + std::lock_guard<std::mutex> guard(m_child_mutex); + m_name_toindex[name.GetCString()] = index; return index; } else if (!did_find && m_synth_filter_up == nullptr) return UINT32_MAX; diff --git a/lldb/source/DataFormatters/DataVisualization.cpp b/lldb/source/DataFormatters/DataVisualization.cpp index 08b3b34447bba..e73d44f60f03f 100644 --- a/lldb/source/DataFormatters/DataVisualization.cpp +++ b/lldb/source/DataFormatters/DataVisualization.cpp @@ -122,8 +122,7 @@ void DataVisualization::Categories::Enable(ConstString category, TypeCategoryMap::Position pos) { if (GetFormatManager().GetCategory(category)->IsEnabled()) GetFormatManager().DisableCategory(category); - GetFormatManager().EnableCategory( - category, pos, std::initializer_list<lldb::LanguageType>()); + GetFormatManager().EnableCategory(category, pos, {}); } void DataVisualization::Categories::Enable(lldb::LanguageType lang_type) { diff --git a/lldb/source/DataFormatters/TypeCategory.cpp b/lldb/source/DataFormatters/TypeCategory.cpp index fed2dfb3c7c5b..be3b31603eac4 100644 --- a/lldb/source/DataFormatters/TypeCategory.cpp +++ b/lldb/source/DataFormatters/TypeCategory.cpp @@ -13,18 +13,14 @@ using namespace lldb; using namespace lldb_private; -TypeCategoryImpl::TypeCategoryImpl( - IFormatChangeListener *clist, ConstString name, - std::initializer_list<lldb::LanguageType> langs) +TypeCategoryImpl::TypeCategoryImpl(IFormatChangeListener *clist, + ConstString name) : m_format_cont("format", "regex-format", clist), m_summary_cont("summary", "regex-summary", clist), m_filter_cont("filter", "regex-filter", clist), m_synth_cont("synth", "regex-synth", clist), m_validator_cont("validator", "regex-validator", clist), m_enabled(false), - m_change_listener(clist), m_mutex(), m_name(name), m_languages() { - for (const lldb::LanguageType lang : langs) - AddLanguage(lang); -} + m_change_listener(clist), m_mutex(), m_name(name), m_languages() {} static bool IsApplicable(lldb::LanguageType category_lang, lldb::LanguageType valobj_lang) { @@ -90,12 +86,6 @@ void TypeCategoryImpl::AddLanguage(lldb::LanguageType lang) { m_languages.push_back(lang); } -bool TypeCategoryImpl::HasLanguage(lldb::LanguageType lang) { - const auto iter = std::find(m_languages.begin(), m_languages.end(), lang), - end = m_languages.end(); - return (iter != end); -} - bool TypeCategoryImpl::Get(ValueObject &valobj, const FormattersMatchVector &candidates, lldb::TypeFormatImplSP &entry, uint32_t *reason) { diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index a063da0f4e401..8947500959cbd 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -146,7 +146,7 @@ void DWARFExpression::GetDescription(Stream *s, lldb::DescriptionLevel level, // We have a new base address if (count > 0) s->PutCString(", "); - *s << "base_addr = " << end_addr_offset;
+ s->Format("base_addr = {0:x}", end_addr_offset); + } } diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 3ae837866faf1..b29c218f0369b 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -97,6 +97,33 @@ bool IsOnlySpaces(const EditLineStringType &content) { return true; } +static int GetOperation(HistoryOperation op) { + // The naming used by editline for the history operations is counter + // intuitive to how it's used here. + // + // - The H_PREV operation returns the previous element in the history, which + // is newer than the current one. + // + // - The H_NEXT operation returns the next element in the history, which is + // older than the current one. + // + // The naming of the enum entries match the semantic meaning. + switch(op) { + case HistoryOperation::Oldest: + return H_FIRST; + case HistoryOperation::Older: + return H_NEXT; + case HistoryOperation::Current: + return H_CURR; + case HistoryOperation::Newer: + return H_PREV; + case HistoryOperation::Newest: + return H_LAST; + } + llvm_unreachable("Fully covered switch!"); +} + + EditLineStringType CombineLines(const std::vector<EditLineStringType> &lines) { EditLineStringStreamType combined_stream; for (EditLineStringType line : lines) { @@ -423,7 +450,8 @@ StringList Editline::GetInputAsStringList(int line_count) { return lines; } -unsigned char Editline::RecallHistory(bool earlier) { +unsigned char Editline::RecallHistory(HistoryOperation op) { + assert(op == HistoryOperation::Older || op == HistoryOperation::Newer); if (!m_history_sp || !m_history_sp->IsValid()) return CC_ERROR; @@ -433,27 +461,38 @@ unsigned char Editline::RecallHistory(bool earlier) { // Treat moving from the "live" entry differently if (!m_in_history) { - if (!earlier) + switch (op) { + case HistoryOperation::Newer: return CC_ERROR; // Can't go newer than the "live" entry - if (history_w(pHistory, &history_event, H_FIRST) == -1) - return CC_ERROR; - - // Save any edits to the "live" entry in case we return by moving forward - // in history (it would be more bash-like to save over any current entry, - // but libedit doesn't offer the ability to add entries anywhere except the - // end.) - SaveEditedLine(); - m_live_history_lines = m_input_lines; - m_in_history = true; + case HistoryOperation::Older: { + if (history_w(pHistory, &history_event, + GetOperation(HistoryOperation::Newest)) == -1) + return CC_ERROR; + // Save any edits to the "live" entry in case we return by moving forward + // in history (it would be more bash-like to save over any current entry, + // but libedit doesn't offer the ability to add entries anywhere except + // the end.) + SaveEditedLine(); + m_live_history_lines = m_input_lines; + m_in_history = true; + } break; + default: + llvm_unreachable("unsupported history direction"); + } } else { - if (history_w(pHistory, &history_event, earlier ? H_PREV : H_NEXT) == -1) { - // Can't move earlier than the earliest entry - if (earlier) + if (history_w(pHistory, &history_event, GetOperation(op)) == -1) { + switch (op) { + case HistoryOperation::Older: + // Can't move earlier than the earliest entry. return CC_ERROR; - - // ... but moving to newer than the newest yields the "live" entry - new_input_lines = m_live_history_lines; - m_in_history = false; + case HistoryOperation::Newer: + // Moving to newer-than-the-newest entry yields the "live" entry.
+ new_input_lines = m_live_history_lines; + m_in_history = false; + break; + default: + llvm_unreachable("unsupported history direction"); + } } } @@ -468,8 +507,17 @@ unsigned char Editline::RecallHistory(bool earlier) { // Prepare to edit the last line when moving to previous entry, or the first // line when moving to next entry - SetCurrentLine(m_current_line_index = - earlier ? (int)m_input_lines.size() - 1 : 0); + switch (op) { + case HistoryOperation::Older: + m_current_line_index = (int)m_input_lines.size() - 1; + break; + case HistoryOperation::Newer: + m_current_line_index = 0; + break; + default: + llvm_unreachable("unsupported history direction"); + } + SetCurrentLine(m_current_line_index); MoveCursor(CursorLocation::BlockEnd, CursorLocation::EditingPrompt); return CC_NEWLINE; } @@ -721,7 +769,7 @@ unsigned char Editline::PreviousLineCommand(int ch) { SaveEditedLine(); if (m_current_line_index == 0) { - return RecallHistory(true); + return RecallHistory(HistoryOperation::Older); } // Start from a known location @@ -747,7 +795,7 @@ unsigned char Editline::NextLineCommand(int ch) { // Don't add an extra line if the existing last line is blank, move through // history instead if (IsOnlySpaces()) { - return RecallHistory(false); + return RecallHistory(HistoryOperation::Newer); } // Determine indentation for the new line @@ -779,13 +827,13 @@ unsigned char Editline::NextLineCommand(int ch) { unsigned char Editline::PreviousHistoryCommand(int ch) { SaveEditedLine(); - return RecallHistory(true); + return RecallHistory(HistoryOperation::Older); } unsigned char Editline::NextHistoryCommand(int ch) { SaveEditedLine(); - return RecallHistory(false); + return RecallHistory(HistoryOperation::Newer); } unsigned char Editline::FixIndentationCommand(int ch) { diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm index 8c7393739bc68..03880ff433bd6 100644 --- a/lldb/source/Host/macosx/objcxx/Host.mm +++ b/lldb/source/Host/macosx/objcxx/Host.mm @@ -1130,7 +1130,7 @@ static Status LaunchProcessPosixSpawn(const char *exe_path, // --arch as part of the shell invocation // to do that job on OSX. - if (launch_info.GetShell() == nullptr) { + if (launch_info.GetShell() == FileSpec()) { // We don't need to do this for ARM, and we really shouldn't now that we // have multiple CPU subtypes and no posix_spawnattr call that allows us // to set which CPU subtype to launch... 
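A note on the Editline change above: libedit's H_PREV/H_NEXT macros are named relative to insertion order, so H_PREV walks toward newer entries and H_NEXT toward older ones, which is the inversion the GetOperation() comment describes. The standalone sketch below (the H_* values here are stand-ins for the histedit.h macros, and the assertions are only illustrative, so this is not part of the patch) shows the shape of that mapping and why call sites read better with the enum:

    // Standalone C++ sketch of the GetOperation()-style mapping.
    #include <cassert>

    // Stand-ins for the histedit.h macros; only their identities matter here.
    enum { H_FIRST, H_NEXT, H_CURR, H_PREV, H_LAST };

    enum class HistoryOperation { Oldest, Older, Current, Newer, Newest };

    static int GetOperation(HistoryOperation op) {
      switch (op) {
      case HistoryOperation::Oldest:
        return H_FIRST;
      case HistoryOperation::Older:
        return H_NEXT; // editline's "next" entry is the older one
      case HistoryOperation::Current:
        return H_CURR;
      case HistoryOperation::Newer:
        return H_PREV; // editline's "previous" entry is the newer one
      case HistoryOperation::Newest:
        return H_LAST;
      }
      return H_CURR; // not reached; every enumerator is handled above
    }

    int main() {
      // RecallHistory(HistoryOperation::Older) states the direction directly,
      // where the old RecallHistory(true) left "true" ambiguous at call sites.
      assert(GetOperation(HistoryOperation::Newer) == H_PREV);
      assert(GetOperation(HistoryOperation::Older) == H_NEXT);
      return 0;
    }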
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index e022481484135..5a4e466144a6f 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -362,10 +362,23 @@ void CommandInterpreter::Initialize() { "controlled by the type's author."); po->SetHelpLong(""); } - AddAlias("parray", cmd_obj_sp, "--element-count %1 --")->SetHelpLong(""); - AddAlias("poarray", cmd_obj_sp, - "--object-description --element-count %1 --") - ->SetHelpLong(""); + CommandAlias *parray_alias = AddAlias("parray", cmd_obj_sp, + "--element-count %1 --"); + if (parray_alias) { + parray_alias->SetHelp + ("parray <COUNT> <EXPRESSION> -- lldb will evaluate EXPRESSION " + "to get a typed-pointer-to-an-array in memory, and will display " + "COUNT elements of that type from the array."); + parray_alias->SetHelpLong(""); + } + CommandAlias *poarray_alias = AddAlias("poarray", cmd_obj_sp, + "--object-description --element-count %1 --"); + if (poarray_alias) { + poarray_alias->SetHelp("poarray <COUNT> <EXPRESSION> -- lldb will " + "evaluate EXPRESSION to get the address of an array of COUNT " + "objects in memory, and will call po on them."); + poarray_alias->SetHelpLong(""); + } } cmd_obj_sp = GetCommandSPExact("process kill", false); diff --git a/lldb/source/Interpreter/OptionGroupPythonClassWithDict.cpp b/lldb/source/Interpreter/OptionGroupPythonClassWithDict.cpp index 20a7ed1f76ca3..e41f9d7b40ee5 100644 --- a/lldb/source/Interpreter/OptionGroupPythonClassWithDict.cpp +++ b/lldb/source/Interpreter/OptionGroupPythonClassWithDict.cpp @@ -127,6 +127,7 @@ void OptionGroupPythonClassWithDict::OptionParsingStarting( // the user didn't pass any -k -v pairs. We want to be able to warn if these // were passed when the function they passed won't use them. m_dict_sp.reset(); + m_name.clear(); } Status OptionGroupPythonClassWithDict::OptionParsingFinished( diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp index 654585cb35eba..fb8b48cc108be 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp @@ -734,7 +734,7 @@ bool DynamicLoaderMacOSXDYLD::InitializeFromAllImageInfos() { if (!module_sp->IsLoadedInTarget(&target)) { if (log) { StreamString s; - module_sp->GetDescription(&s); + module_sp->GetDescription(s.AsRawOstream()); LLDB_LOGF(log, "Unloading pre-run module: %s.", s.GetData()); } not_loaded_modules.Append(module_sp); diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp index 7440f6a0c3636..51540902e2dcc 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.cpp @@ -57,10 +57,11 @@ ClangASTSource::ClangASTSource(const lldb::TargetSP &target) } } -void ClangASTSource::InstallASTContext(clang::ASTContext &ast_context, +void ClangASTSource::InstallASTContext(ClangASTContext &clang_ast_context, clang::FileManager &file_manager, bool is_shared_context) { - m_ast_context = &ast_context; + m_ast_context = clang_ast_context.getASTContext(); + m_clang_ast_context = &clang_ast_context; m_file_manager = &file_manager; if (m_target->GetUseModernTypeLookup()) { // Configure the ExternalASTMerger.
The merger needs to be able to import @@ -69,7 +70,7 @@ void ClangASTSource::InstallASTContext(clang::ASTContext &ast_context, // AST contexts. lldbassert(!m_merger_up); - clang::ExternalASTMerger::ImporterTarget target = {ast_context, + clang::ExternalASTMerger::ImporterTarget target = {*m_ast_context, file_manager}; std::vector<clang::ExternalASTMerger::ImporterSource> sources; for (lldb::ModuleSP module_sp : m_target->GetImages().Modules()) { @@ -132,7 +133,7 @@ void ClangASTSource::InstallASTContext(clang::ASTContext &ast_context, m_merger_up = std::make_unique<clang::ExternalASTMerger>(target, sources); } else { - m_ast_importer_sp->InstallMapCompleter(&ast_context, *this); + m_ast_importer_sp->InstallMapCompleter(m_ast_context, *this); } } @@ -363,7 +364,6 @@ void ClangASTSource::CompleteType(TagDecl *tag_decl) { TypeList types; ConstString name(tag_decl->getName().str().c_str()); - CompilerDeclContext namespace_decl; const ModuleList &module_list = m_target->GetImages(); @@ -776,7 +776,7 @@ void ClangASTSource::FindExternalVisibleDecls(NameSearchContext &context) { } clang::Sema *ClangASTSource::getSema() { - return ClangASTContext::GetASTContext(m_ast_context)->getSema(); + return m_clang_ast_context->getSema(); } bool ClangASTSource::IgnoreName(const ConstString name, @@ -2059,8 +2059,7 @@ CompilerType ClangASTSource::GuardedCopyType(const CompilerType &src_type) { // seems to be generating bad types on occasion. return CompilerType(); - return CompilerType(ClangASTContext::GetASTContext(m_ast_context), - copied_qual_type.getAsOpaquePtr()); + return CompilerType(m_clang_ast_context, copied_qual_type.getAsOpaquePtr()); } clang::NamedDecl *NameSearchContext::AddVarDecl(const CompilerType &type) { @@ -2187,10 +2186,9 @@ clang::NamedDecl *NameSearchContext::AddGenericFunDecl() { ArrayRef<QualType>(), // argument types proto_info)); - return AddFunDecl( - CompilerType(ClangASTContext::GetASTContext(m_ast_source.m_ast_context), - generic_function_type.getAsOpaquePtr()), - true); + return AddFunDecl(CompilerType(m_ast_source.m_clang_ast_context, + generic_function_type.getAsOpaquePtr()), + true); } clang::NamedDecl * diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h index d8e784f49b10e..194233e4a028e 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTSource.h @@ -57,7 +57,7 @@ class ClangASTSource : public ClangExternalASTSourceCommon, } void MaterializeVisibleDecls(const clang::DeclContext *DC) { return; } - void InstallASTContext(clang::ASTContext &ast_context, + void InstallASTContext(ClangASTContext &ast_context, clang::FileManager &file_manager, bool is_shared_context = false); @@ -408,6 +408,8 @@ class ClangASTSource : public ClangExternalASTSourceCommon, const lldb::TargetSP m_target; /// The AST context requests are coming in for. clang::ASTContext *m_ast_context; + /// The ClangASTContext for m_ast_context. + ClangASTContext *m_clang_ast_context; /// The file manager paired with the AST context. clang::FileManager *m_file_manager; /// The target's AST importer.
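The ClangASTSource changes above replace repeated reverse lookups of the form ClangASTContext::GetASTContext(m_ast_context) with a cached pointer to the owning ClangASTContext, from which the raw clang::ASTContext is derived once at installation time. A minimal sketch of that pattern, using hypothetical stand-in types rather than the real LLDB and Clang classes:

    // C++ sketch; RawASTContext/WrapperASTContext are stand-ins for
    // clang::ASTContext and lldb's ClangASTContext wrapper.
    struct RawASTContext {};
    struct WrapperASTContext {
      RawASTContext *getASTContext() { return &m_ctx; }
      RawASTContext m_ctx;
    };

    class Source {
    public:
      // The wrapper is handed in once; the raw context is derived from it,
      // so later accesses (like getSema() in the patch) never need to map a
      // raw context back to its wrapper.
      void InstallASTContext(WrapperASTContext &wrapper) {
        m_clang_ast_context = &wrapper;
        m_ast_context = wrapper.getASTContext();
      }

    private:
      RawASTContext *m_ast_context = nullptr;           // derived raw pointer
      WrapperASTContext *m_clang_ast_context = nullptr; // cached wrapper
    };

    int main() {
      WrapperASTContext wrapper;
      Source source;
      source.InstallASTContext(wrapper);
      return 0;
    }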
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp index 4966ac1640feb..fc25a2e72e3b1 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.cpp @@ -1076,12 +1076,9 @@ void ClangExpressionDeclMap::LookupLocalVarNamespace( if (!frame_ast) return; - ClangASTContext *map_ast = ClangASTContext::GetASTContext(m_ast_context); - if (!map_ast) - return; - - clang::NamespaceDecl *namespace_decl = map_ast->GetUniqueNamespaceDeclaration( - g_lldb_local_vars_namespace_cstr, nullptr); + clang::NamespaceDecl *namespace_decl = + m_clang_ast_context->GetUniqueNamespaceDeclaration( + g_lldb_local_vars_namespace_cstr, nullptr); if (!namespace_decl) return; @@ -1180,6 +1177,104 @@ bool ClangExpressionDeclMap::LookupLocalVariable( return variable_found; } +/// Structure to hold the info needed when comparing function +/// declarations. +namespace { +struct FuncDeclInfo { + ConstString m_name; + CompilerType m_copied_type; + uint32_t m_decl_lvl; + SymbolContext m_sym_ctx; +}; +} // namespace + +SymbolContextList ClangExpressionDeclMap::SearchFunctionsInSymbolContexts( + const SymbolContextList &sc_list, + const CompilerDeclContext &frame_decl_context) { + // First, simplify things by looping through the symbol contexts to + // remove unwanted functions and separate out the functions we want to + // compare and prune into a separate list. Cache the info needed about + // the function declarations in a vector for efficiency. + uint32_t num_indices = sc_list.GetSize(); + SymbolContextList sc_sym_list; + std::vector<FuncDeclInfo> decl_infos; + decl_infos.reserve(num_indices); + clang::DeclContext *frame_decl_ctx = + (clang::DeclContext *)frame_decl_context.GetOpaqueDeclContext(); + ClangASTContext *ast = llvm::dyn_cast_or_null<ClangASTContext>( + frame_decl_context.GetTypeSystem()); + + for (uint32_t index = 0; index < num_indices; ++index) { + FuncDeclInfo fdi; + SymbolContext sym_ctx; + sc_list.GetContextAtIndex(index, sym_ctx); + + // We don't know enough about symbols to compare them, but we should + // keep them in the list. + Function *function = sym_ctx.function; + if (!function) { + sc_sym_list.Append(sym_ctx); + continue; + } + // Filter out functions without declaration contexts, as well as + // class/instance methods, since they'll be skipped in the code that + // follows anyway. + CompilerDeclContext func_decl_context = function->GetDeclContext(); + if (!func_decl_context || + func_decl_context.IsClassMethod(nullptr, nullptr, nullptr)) + continue; + // We can only prune functions for which we can copy the type. + CompilerType func_clang_type = function->GetType()->GetFullCompilerType(); + CompilerType copied_func_type = GuardedCopyType(func_clang_type); + if (!copied_func_type) { + sc_sym_list.Append(sym_ctx); + continue; + } + + fdi.m_sym_ctx = sym_ctx; + fdi.m_name = function->GetName(); + fdi.m_copied_type = copied_func_type; + fdi.m_decl_lvl = LLDB_INVALID_DECL_LEVEL; + if (fdi.m_copied_type && func_decl_context) { + // Call CountDeclLevels to get the number of parent scopes we have + // to look through before we find the function declaration. When + // comparing functions of the same type, the one with a lower count + // will be closer to us in the lookup scope and shadows the other.
+ clang::DeclContext *func_decl_ctx = + (clang::DeclContext *)func_decl_context.GetOpaqueDeclContext(); + fdi.m_decl_lvl = ast->CountDeclLevels(frame_decl_ctx, func_decl_ctx, + &fdi.m_name, &fdi.m_copied_type); + } + decl_infos.emplace_back(fdi); + } + + // Loop through the functions in our cache looking for matching types, + // then compare their scope levels to see which is closer. + std::multimap matches; + for (const FuncDeclInfo &fdi : decl_infos) { + const CompilerType t = fdi.m_copied_type; + auto q = matches.find(t); + if (q != matches.end()) { + if (q->second->m_decl_lvl > fdi.m_decl_lvl) + // This function is closer; remove the old set. + matches.erase(t); + else if (q->second->m_decl_lvl < fdi.m_decl_lvl) + // The functions in our set are closer - skip this one. + continue; + } + matches.insert(std::make_pair(t, &fdi)); + } + + // Loop through our matches and add their symbol contexts to our list. + SymbolContextList sc_func_list; + for (const auto &q : matches) + sc_func_list.Append(q.second->m_sym_ctx); + + // Rejoin the lists with the functions in front. + sc_func_list.Append(sc_sym_list); + return sc_func_list; +} + void ClangExpressionDeclMap::LookupFunction(NameSearchContext &context, lldb::ModuleSP module_sp, ConstString name, @@ -1237,98 +1332,7 @@ void ClangExpressionDeclMap::LookupFunction(NameSearchContext &context, // We can't do this without a compiler decl context for our frame. if (frame_decl_context) { - clang::DeclContext *frame_decl_ctx = - (clang::DeclContext *)frame_decl_context.GetOpaqueDeclContext(); - ClangASTContext *ast = llvm::dyn_cast_or_null( - frame_decl_context.GetTypeSystem()); - - // Structure to hold the info needed when comparing function - // declarations. - struct FuncDeclInfo { - ConstString m_name; - CompilerType m_copied_type; - uint32_t m_decl_lvl; - SymbolContext m_sym_ctx; - }; - - // First, symplify things by looping through the symbol contexts to - // remove unwanted functions and separate out the functions we want to - // compare and prune into a separate list. Cache the info needed about - // the function declarations in a vector for efficiency. - SymbolContextList sc_sym_list; - uint32_t num_indices = sc_list.GetSize(); - std::vector fdi_cache; - fdi_cache.reserve(num_indices); - for (uint32_t index = 0; index < num_indices; ++index) { - FuncDeclInfo fdi; - SymbolContext sym_ctx; - sc_list.GetContextAtIndex(index, sym_ctx); - - // We don't know enough about symbols to compare them, but we should - // keep them in the list. - Function *function = sym_ctx.function; - if (!function) { - sc_sym_list.Append(sym_ctx); - continue; - } - // Filter out functions without declaration contexts, as well as - // class/instance methods, since they'll be skipped in the code that - // follows anyway. - CompilerDeclContext func_decl_context = function->GetDeclContext(); - if (!func_decl_context || - func_decl_context.IsClassMethod(nullptr, nullptr, nullptr)) - continue; - // We can only prune functions for which we can copy the type. 
- CompilerType func_clang_type = - function->GetType()->GetFullCompilerType(); - CompilerType copied_func_type = GuardedCopyType(func_clang_type); - if (!copied_func_type) { - sc_sym_list.Append(sym_ctx); - continue; - } - - fdi.m_sym_ctx = sym_ctx; - fdi.m_name = function->GetName(); - fdi.m_copied_type = copied_func_type; - fdi.m_decl_lvl = LLDB_INVALID_DECL_LEVEL; - if (fdi.m_copied_type && func_decl_context) { - // Call CountDeclLevels to get the number of parent scopes we have - // to look through before we find the function declaration. When - // comparing functions of the same type, the one with a lower count - // will be closer to us in the lookup scope and shadows the other. - clang::DeclContext *func_decl_ctx = - (clang::DeclContext *)func_decl_context.GetOpaqueDeclContext(); - fdi.m_decl_lvl = ast->CountDeclLevels( - frame_decl_ctx, func_decl_ctx, &fdi.m_name, &fdi.m_copied_type); - } - fdi_cache.emplace_back(fdi); - } - - // Loop through the functions in our cache looking for matching types, - // then compare their scope levels to see which is closer. - std::multimap matches; - for (const FuncDeclInfo &fdi : fdi_cache) { - const CompilerType t = fdi.m_copied_type; - auto q = matches.find(t); - if (q != matches.end()) { - if (q->second->m_decl_lvl > fdi.m_decl_lvl) - // This function is closer; remove the old set. - matches.erase(t); - else if (q->second->m_decl_lvl < fdi.m_decl_lvl) - // The functions in our set are closer - skip this one. - continue; - } - matches.insert(std::make_pair(t, &fdi)); - } - - // Loop through our matches and add their symbol contexts to our list. - SymbolContextList sc_func_list; - for (const auto &q : matches) - sc_func_list.Append(q.second->m_sym_ctx); - - // Rejoin the lists with the functions in front. - sc_list = sc_func_list; - sc_list.Append(sc_sym_list); + sc_list = SearchFunctionsInSymbolContexts(sc_list, frame_decl_context); } } @@ -1724,8 +1728,7 @@ void ClangExpressionDeclMap::AddOneGenericVariable(NameSearchContext &context, TypeFromUser user_type(scratch_ast_context->GetBasicType(eBasicTypeVoid) .GetPointerType() .GetLValueReferenceType()); - ClangASTContext *own_context = ClangASTContext::GetASTContext(m_ast_context); - TypeFromParser parser_type(own_context->GetBasicType(eBasicTypeVoid) + TypeFromParser parser_type(m_clang_ast_context->GetBasicType(eBasicTypeVoid) .GetPointerType() .GetLValueReferenceType()); NamedDecl *var_decl = context.AddVarDecl(parser_type); @@ -1766,8 +1769,8 @@ void ClangExpressionDeclMap::AddOneRegister(NameSearchContext &context, Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS)); CompilerType clang_type = - ClangASTContext::GetBuiltinTypeForEncodingAndBitSize( - m_ast_context, reg_info->encoding, reg_info->byte_size * 8); + m_clang_ast_context->GetBuiltinTypeForEncodingAndBitSize( + reg_info->encoding, reg_info->byte_size * 8); if (!clang_type) { LLDB_LOGF(log, " Tried to add a type for %s, but couldn't get one", @@ -2003,9 +2006,8 @@ void ClangExpressionDeclMap::AddThisType(NameSearchContext &context, if (copied_clang_type.IsAggregateType() && copied_clang_type.GetCompleteType()) { - ClangASTContext *own_context = - ClangASTContext::GetASTContext(m_ast_context); - CompilerType void_clang_type = own_context->GetBasicType(eBasicTypeVoid); + CompilerType void_clang_type = + m_clang_ast_context->GetBasicType(eBasicTypeVoid); CompilerType void_ptr_clang_type = void_clang_type.GetPointerType(); CompilerType method_type = ClangASTContext::CreateFunctionType( @@ -2018,12 +2020,10 @@ void 
ClangExpressionDeclMap::AddThisType(NameSearchContext &context, const bool is_attr_used = true; const bool is_artificial = false; - CXXMethodDecl *method_decl = - ClangASTContext::GetASTContext(m_ast_context) - ->AddMethodToCXXRecordType( - copied_clang_type.GetOpaqueQualType(), "$__lldb_expr", nullptr, - method_type, lldb::eAccessPublic, is_virtual, is_static, - is_inline, is_explicit, is_attr_used, is_artificial); + CXXMethodDecl *method_decl = m_clang_ast_context->AddMethodToCXXRecordType( + copied_clang_type.GetOpaqueQualType(), "$__lldb_expr", nullptr, + method_type, lldb::eAccessPublic, is_virtual, is_static, is_inline, + is_explicit, is_attr_used, is_artificial); LLDB_LOG(log, " CEDM::AddThisType Added function $__lldb_expr " diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h index 1f308edf20cf2..5cd16d5d16874 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionDeclMap.h @@ -458,6 +458,23 @@ class ClangExpressionDeclMap : public ClangASTSource { unsigned current_id, SymbolContext &sym_ctx, CompilerDeclContext &namespace_decl); + /// Searches for functions in the given SymbolContextList. + /// + /// \param[in] sc_list + /// The SymbolContextList to search. + /// + /// \param[in] frame_decl_context + /// The current DeclContext of the current frame. + /// + /// \return + /// A SymbolContextList with any found functions in the front and + /// any unknown SymbolContexts which are not functions in the back. + /// The SymbolContexts for the functions are ordered by how close they are + /// to the DeclContext for the given frame DeclContext. + SymbolContextList SearchFunctionsInSymbolContexts( + const SymbolContextList &sc_list, + const CompilerDeclContext &frame_decl_context); + /// Looks up a function. /// /// \param[in] context diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index a0f966ddd5111..15b242a8b87ee 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -997,7 +997,7 @@ ClangExpressionParser::ParseInternal(DiagnosticManager &diagnostic_manager, } else { ast_context.setExternalSource(ast_source); } - decl_map->InstallASTContext(ast_context, m_compiler->getFileManager()); + decl_map->InstallASTContext(*m_ast_context, m_compiler->getFileManager()); } // Check that the ASTReader is properly attached to ASTContext and Sema. diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp index 19a987b0f0042..ff142e6f35ff2 100644 --- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp +++ b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp @@ -850,6 +850,7 @@ uint32_t EmulateInstructionARM::GetFramePointerRegisterNumber() const { /* On Apple iOS et al, the frame pointer register is always r7. * Typically on other ARM systems, thumb code uses r7; arm code uses r11. + * Windows on ARM, which is in thumb mode, uses r11 though. 
*/ uint32_t fp_regnum = 11; @@ -857,7 +858,7 @@ uint32_t EmulateInstructionARM::GetFramePointerRegisterNumber() const { if (is_apple) fp_regnum = 7; - if (m_opcode_mode == eModeThumb) + if (m_opcode_mode == eModeThumb && !m_arch.GetTriple().isOSWindows()) fp_regnum = 7; return fp_regnum; @@ -879,6 +880,7 @@ uint32_t EmulateInstructionARM::GetFramePointerDWARFRegisterNumber() const { /* On Apple iOS et al, the frame pointer register is always r7. * Typically on other ARM systems, thumb code uses r7; arm code uses r11. + * Windows on ARM, which is in thumb mode, uses r11 though. */ uint32_t fp_regnum = dwarf_r11; @@ -886,7 +888,7 @@ uint32_t EmulateInstructionARM::GetFramePointerDWARFRegisterNumber() const { if (is_apple) fp_regnum = dwarf_r7; - if (m_opcode_mode == eModeThumb) + if (m_opcode_mode == eModeThumb && !m_arch.GetTriple().isOSWindows()) fp_regnum = dwarf_r7; return fp_regnum; @@ -1343,6 +1345,8 @@ bool EmulateInstructionARM::EmulateMOVRdRm(const uint32_t opcode, EmulateInstruction::Context context; if (Rd == 13) context.type = EmulateInstruction::eContextAdjustStackPointer; + else if (Rd == GetFramePointerRegisterNumber() && Rm == 13) + context.type = EmulateInstruction::eContextSetFramePointer; else context.type = EmulateInstruction::eContextRegisterPlusOffset; RegisterInfo dwarf_reg; diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index c22f4ae9e41a9..4385a60f58623 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -284,46 +284,34 @@ class NodeAllocator { } }; -/// Given a mangled function `Mangled`, replace all the primitive function type -/// arguments of `Search` with type `Replace`. -class TypeSubstitutor - : public llvm::itanium_demangle::AbstractManglingParser +class ManglingSubstitutor + : public llvm::itanium_demangle::AbstractManglingParser { - /// Input character until which we have constructed the respective output - /// already - const char *Written; + using Base = + llvm::itanium_demangle::AbstractManglingParser; - llvm::StringRef Search; - llvm::StringRef Replace; - llvm::SmallString<128> Result; +public: + ManglingSubstitutor() : Base(nullptr, nullptr) {} - /// Whether we have performed any substitutions. - bool Substituted; + template + ConstString substitute(llvm::StringRef Mangled, Ts &&... 
Vals) { + this->getDerived().reset(Mangled, std::forward(Vals)...); + return substituteImpl(Mangled); + } - void reset(llvm::StringRef Mangled, llvm::StringRef Search, - llvm::StringRef Replace) { - AbstractManglingParser::reset(Mangled.begin(), Mangled.end()); + +protected: + void reset(llvm::StringRef Mangled) { + Base::reset(Mangled.begin(), Mangled.end()); Written = Mangled.begin(); - this->Search = Search; - this->Replace = Replace; Result.clear(); Substituted = false; } - void appendUnchangedInput() { - Result += llvm::StringRef(Written, First - Written); - Written = First; - } - -public: - TypeSubstitutor() : AbstractManglingParser(nullptr, nullptr) {} - - ConstString substitute(llvm::StringRef Mangled, llvm::StringRef From, - llvm::StringRef To) { + ConstString substituteImpl(llvm::StringRef Mangled) { Log *log = GetLogIfAllCategoriesSet(LIBLLDB_LOG_LANGUAGE); - - reset(Mangled, From, To); - if (parse() == nullptr) { + if (this->parse() == nullptr) { LLDB_LOG(log, "Failed to substitute mangling in {0}", Mangled); return ConstString(); } @@ -336,20 +324,69 @@ class TypeSubstitutor return ConstString(Result); } + void trySubstitute(llvm::StringRef From, llvm::StringRef To) { + if (!llvm::StringRef(currentParserPos(), this->numLeft()).startswith(From)) + return; + + // We found a match. Append unmodified input up to this point. + appendUnchangedInput(); + + // And then perform the replacement. + Result += To; + Written += From.size(); + Substituted = true; + } + +private: + /// Input character until which we have constructed the respective output + /// already. + const char *Written; + + llvm::SmallString<128> Result; + + /// Whether we have performed any substitutions. + bool Substituted; + + const char *currentParserPos() const { return this->First; } + + void appendUnchangedInput() { + Result += + llvm::StringRef(Written, std::distance(Written, currentParserPos())); + Written = currentParserPos(); + } + +}; + +/// Given a mangled function `Mangled`, replace all the primitive function type +/// arguments of `Search` with type `Replace`. +class TypeSubstitutor : public ManglingSubstitutor { + llvm::StringRef Search; + llvm::StringRef Replace; + +public: + void reset(llvm::StringRef Mangled, llvm::StringRef Search, + llvm::StringRef Replace) { + ManglingSubstitutor::reset(Mangled); + this->Search = Search; + this->Replace = Replace; + } + llvm::itanium_demangle::Node *parseType() { - if (llvm::StringRef(First, numLeft()).startswith(Search)) { - // We found a match. Append unmodified input up to this point. - appendUnchangedInput(); - - // And then perform the replacement. 
- Result += Replace; - Written += Search.size(); - Substituted = true; - } - return AbstractManglingParser::parseType(); + trySubstitute(Search, Replace); + return ManglingSubstitutor::parseType(); } }; -} + +class CtorDtorSubstitutor : public ManglingSubstitutor { +public: + llvm::itanium_demangle::Node * + parseCtorDtorName(llvm::itanium_demangle::Node *&SoFar, NameState *State) { + trySubstitute("C1", "C2"); + trySubstitute("D1", "D2"); + return ManglingSubstitutor::parseCtorDtorName(SoFar, State); + } +}; +} // namespace uint32_t CPlusPlusLanguage::FindAlternateFunctionManglings( const ConstString mangled_name, std::set &alternates) { @@ -397,6 +434,10 @@ uint32_t CPlusPlusLanguage::FindAlternateFunctionManglings( TS.substitute(mangled_name.GetStringRef(), "y", "m")) alternates.insert(ulong_fixup); + if (ConstString ctor_fixup = + CtorDtorSubstitutor().substitute(mangled_name.GetStringRef())) + alternates.insert(ctor_fixup); + return alternates.size() - start_size; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp index 619c718a1c1b9..f6d8d4d9a7eb9 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp @@ -94,6 +94,8 @@ class MapIterator { MapIterator(ValueObject *entry, size_t depth = 0) : m_entry(entry), m_max_depth(depth), m_error(false) {} + MapIterator &operator=(const MapIterator &) = default; + ValueObjectSP value() { return m_entry.GetEntry(); } ValueObjectSP advance(size_t count) { diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index b4d8ba2218a17..d556aae1c458c 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -62,7 +62,7 @@ bool CPPLanguageRuntime::GetObjectDescription( bool contains_lambda_identifier(llvm::StringRef &str_ref) { return str_ref.contains("$_") || str_ref.contains("'lambda'"); -}; +} CPPLanguageRuntime::LibCppStdFunctionCallableInfo line_entry_helper(Target &target, const SymbolContext &sc, Symbol *symbol, diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index 9bdbef393e39f..750b6ce6b0c6a 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -1625,19 +1625,13 @@ AppleObjCRuntimeV2::UpdateISAToDescriptorMapSharedCache() { // Substitute in the correct class_getName / class_getNameRaw function name, // concatenate the two parts of our expression text. The format string // has two %s's, so provide the name twice. 
- int prefix_string_size = snprintf (nullptr, 0, + std::string shared_class_expression; + llvm::raw_string_ostream(shared_class_expression) << llvm::format( g_shared_cache_class_name_funcptr, class_name_getter_function_name.AsCString(), class_name_getter_function_name.AsCString()); - char *class_name_func_ptr_expr = (char*) malloc (prefix_string_size + 1); - snprintf (class_name_func_ptr_expr, prefix_string_size + 1, - g_shared_cache_class_name_funcptr, - class_name_getter_function_name.AsCString(), - class_name_getter_function_name.AsCString()); - std::string shared_class_expression = class_name_func_ptr_expr; shared_class_expression += g_get_shared_cache_class_info_body; - free (class_name_func_ptr_expr); m_get_shared_cache_class_info_code.reset( GetTargetRef().GetUtilityFunctionForLanguage( diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 3f8502548fc25..8eadaf1323d55 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -1581,6 +1581,7 @@ static SectionType GetSectionTypeFromName(llvm::StringRef Name) { .Case("pubtypes", eSectionTypeDWARFDebugPubTypes) .Case("ranges", eSectionTypeDWARFDebugRanges) .Case("rnglists", eSectionTypeDWARFDebugRngLists) + .Case("rnglists.dwo", eSectionTypeDWARFDebugRngListsDwo) .Case("str", eSectionTypeDWARFDebugStr) .Case("str.dwo", eSectionTypeDWARFDebugStrDwo) .Case("str_offsets", eSectionTypeDWARFDebugStrOffsets) diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index aff1d1e87bb67..57c43de0c945d 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -1140,6 +1140,7 @@ AddressClass ObjectFileMachO::GetAddressClass(lldb::addr_t file_addr) { case eSectionTypeDWARFDebugPubTypes: case eSectionTypeDWARFDebugRanges: case eSectionTypeDWARFDebugRngLists: + case eSectionTypeDWARFDebugRngListsDwo: case eSectionTypeDWARFDebugStr: case eSectionTypeDWARFDebugStrDwo: case eSectionTypeDWARFDebugStrOffsets: diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp index 37e1120838f37..b0ce967a79665 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp @@ -114,9 +114,10 @@ const char *ObjectFilePECOFF::GetPluginDescriptionStatic() { ObjectFile *ObjectFilePECOFF::CreateInstance(const lldb::ModuleSP &module_sp, DataBufferSP &data_sp, lldb::offset_t data_offset, - const lldb_private::FileSpec *file, + const lldb_private::FileSpec *file_p, lldb::offset_t file_offset, lldb::offset_t length) { + FileSpec file = file_p ? 
*file_p : FileSpec(); if (!data_sp) { data_sp = MapFileData(file, length, file_offset); if (!data_sp) return nullptr; @@ -135,7 +136,7 @@ ObjectFile *ObjectFilePECOFF::CreateInstance(const lldb::ModuleSP &module_sp, } auto objfile_up = std::make_unique<ObjectFilePECOFF>( - module_sp, data_sp, data_offset, file, file_offset, length); + module_sp, data_sp, data_offset, file_p, file_offset, length); if (!objfile_up || !objfile_up->ParseHeader()) return nullptr; @@ -787,6 +788,77 @@ bool ObjectFilePECOFF::IsStripped() { return false; } +SectionType ObjectFilePECOFF::GetSectionType(llvm::StringRef sect_name, + const section_header_t &sect) { + ConstString const_sect_name(sect_name); + static ConstString g_code_sect_name(".code"); + static ConstString g_CODE_sect_name("CODE"); + static ConstString g_data_sect_name(".data"); + static ConstString g_DATA_sect_name("DATA"); + static ConstString g_bss_sect_name(".bss"); + static ConstString g_BSS_sect_name("BSS"); + + if (sect.flags & llvm::COFF::IMAGE_SCN_CNT_CODE && + ((const_sect_name == g_code_sect_name) || + (const_sect_name == g_CODE_sect_name))) { + return eSectionTypeCode; + } + if (sect.flags & llvm::COFF::IMAGE_SCN_CNT_INITIALIZED_DATA && + ((const_sect_name == g_data_sect_name) || + (const_sect_name == g_DATA_sect_name))) { + if (sect.size == 0 && sect.offset == 0) + return eSectionTypeZeroFill; + else + return eSectionTypeData; + } + if (sect.flags & llvm::COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA && + ((const_sect_name == g_bss_sect_name) || + (const_sect_name == g_BSS_sect_name))) { + if (sect.size == 0) + return eSectionTypeZeroFill; + else + return eSectionTypeData; + } + + SectionType section_type = + llvm::StringSwitch<SectionType>(sect_name) + .Case(".debug", eSectionTypeDebug) + .Case(".stabstr", eSectionTypeDataCString) + .Case(".reloc", eSectionTypeOther) + .Case(".debug_abbrev", eSectionTypeDWARFDebugAbbrev) + .Case(".debug_aranges", eSectionTypeDWARFDebugAranges) + .Case(".debug_frame", eSectionTypeDWARFDebugFrame) + .Case(".debug_info", eSectionTypeDWARFDebugInfo) + .Case(".debug_line", eSectionTypeDWARFDebugLine) + .Case(".debug_loc", eSectionTypeDWARFDebugLoc) + .Case(".debug_loclists", eSectionTypeDWARFDebugLocLists) + .Case(".debug_macinfo", eSectionTypeDWARFDebugMacInfo) + .Case(".debug_names", eSectionTypeDWARFDebugNames) + .Case(".debug_pubnames", eSectionTypeDWARFDebugPubNames) + .Case(".debug_pubtypes", eSectionTypeDWARFDebugPubTypes) + .Case(".debug_ranges", eSectionTypeDWARFDebugRanges) + .Case(".debug_str", eSectionTypeDWARFDebugStr) + .Case(".debug_types", eSectionTypeDWARFDebugTypes) + // .eh_frame can be truncated to 8 chars.
+ .Cases(".eh_frame", ".eh_fram", eSectionTypeEHFrame) + .Case(".gosymtab", eSectionTypeGoSymtab) + .Default(eSectionTypeInvalid); + if (section_type != eSectionTypeInvalid) + return section_type; + + if (sect.flags & llvm::COFF::IMAGE_SCN_CNT_CODE) + return eSectionTypeCode; + if (sect.flags & llvm::COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) + return eSectionTypeData; + if (sect.flags & llvm::COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) { + if (sect.size == 0) + return eSectionTypeZeroFill; + else + return eSectionTypeData; + } + return eSectionTypeOther; +} + void ObjectFilePECOFF::CreateSections(SectionList &unified_section_list) { if (m_sections_up) return; @@ -810,104 +882,9 @@ void ObjectFilePECOFF::CreateSections(SectionList &unified_section_list) { const uint32_t nsects = m_sect_headers.size(); ModuleSP module_sp(GetModule()); for (uint32_t idx = 0; idx < nsects; ++idx) { - ConstString const_sect_name(GetSectionName(m_sect_headers[idx])); - static ConstString g_code_sect_name(".code"); - static ConstString g_CODE_sect_name("CODE"); - static ConstString g_data_sect_name(".data"); - static ConstString g_DATA_sect_name("DATA"); - static ConstString g_bss_sect_name(".bss"); - static ConstString g_BSS_sect_name("BSS"); - static ConstString g_debug_sect_name(".debug"); - static ConstString g_reloc_sect_name(".reloc"); - static ConstString g_stab_sect_name(".stab"); - static ConstString g_stabstr_sect_name(".stabstr"); - static ConstString g_sect_name_dwarf_debug_abbrev(".debug_abbrev"); - static ConstString g_sect_name_dwarf_debug_aranges(".debug_aranges"); - static ConstString g_sect_name_dwarf_debug_frame(".debug_frame"); - static ConstString g_sect_name_dwarf_debug_info(".debug_info"); - static ConstString g_sect_name_dwarf_debug_line(".debug_line"); - static ConstString g_sect_name_dwarf_debug_loc(".debug_loc"); - static ConstString g_sect_name_dwarf_debug_loclists(".debug_loclists"); - static ConstString g_sect_name_dwarf_debug_macinfo(".debug_macinfo"); - static ConstString g_sect_name_dwarf_debug_names(".debug_names"); - static ConstString g_sect_name_dwarf_debug_pubnames(".debug_pubnames"); - static ConstString g_sect_name_dwarf_debug_pubtypes(".debug_pubtypes"); - static ConstString g_sect_name_dwarf_debug_ranges(".debug_ranges"); - static ConstString g_sect_name_dwarf_debug_str(".debug_str"); - static ConstString g_sect_name_dwarf_debug_types(".debug_types"); - static ConstString g_sect_name_eh_frame(".eh_frame"); - static ConstString g_sect_name_go_symtab(".gosymtab"); - SectionType section_type = eSectionTypeOther; - if (m_sect_headers[idx].flags & llvm::COFF::IMAGE_SCN_CNT_CODE && - ((const_sect_name == g_code_sect_name) || - (const_sect_name == g_CODE_sect_name))) { - section_type = eSectionTypeCode; - } else if (m_sect_headers[idx].flags & - llvm::COFF::IMAGE_SCN_CNT_INITIALIZED_DATA && - ((const_sect_name == g_data_sect_name) || - (const_sect_name == g_DATA_sect_name))) { - if (m_sect_headers[idx].size == 0 && m_sect_headers[idx].offset == 0) - section_type = eSectionTypeZeroFill; - else - section_type = eSectionTypeData; - } else if (m_sect_headers[idx].flags & - llvm::COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA && - ((const_sect_name == g_bss_sect_name) || - (const_sect_name == g_BSS_sect_name))) { - if (m_sect_headers[idx].size == 0) - section_type = eSectionTypeZeroFill; - else - section_type = eSectionTypeData; - } else if (const_sect_name == g_debug_sect_name) { - section_type = eSectionTypeDebug; - } else if (const_sect_name == g_stabstr_sect_name) { - section_type = 
eSectionTypeDataCString; - } else if (const_sect_name == g_reloc_sect_name) { - section_type = eSectionTypeOther; - } else if (const_sect_name == g_sect_name_dwarf_debug_abbrev) - section_type = eSectionTypeDWARFDebugAbbrev; - else if (const_sect_name == g_sect_name_dwarf_debug_aranges) - section_type = eSectionTypeDWARFDebugAranges; - else if (const_sect_name == g_sect_name_dwarf_debug_frame) - section_type = eSectionTypeDWARFDebugFrame; - else if (const_sect_name == g_sect_name_dwarf_debug_info) - section_type = eSectionTypeDWARFDebugInfo; - else if (const_sect_name == g_sect_name_dwarf_debug_line) - section_type = eSectionTypeDWARFDebugLine; - else if (const_sect_name == g_sect_name_dwarf_debug_loc) - section_type = eSectionTypeDWARFDebugLoc; - else if (const_sect_name == g_sect_name_dwarf_debug_loclists) - section_type = eSectionTypeDWARFDebugLocLists; - else if (const_sect_name == g_sect_name_dwarf_debug_macinfo) - section_type = eSectionTypeDWARFDebugMacInfo; - else if (const_sect_name == g_sect_name_dwarf_debug_names) - section_type = eSectionTypeDWARFDebugNames; - else if (const_sect_name == g_sect_name_dwarf_debug_pubnames) - section_type = eSectionTypeDWARFDebugPubNames; - else if (const_sect_name == g_sect_name_dwarf_debug_pubtypes) - section_type = eSectionTypeDWARFDebugPubTypes; - else if (const_sect_name == g_sect_name_dwarf_debug_ranges) - section_type = eSectionTypeDWARFDebugRanges; - else if (const_sect_name == g_sect_name_dwarf_debug_str) - section_type = eSectionTypeDWARFDebugStr; - else if (const_sect_name == g_sect_name_dwarf_debug_types) - section_type = eSectionTypeDWARFDebugTypes; - else if (const_sect_name == g_sect_name_eh_frame) - section_type = eSectionTypeEHFrame; - else if (const_sect_name == g_sect_name_go_symtab) - section_type = eSectionTypeGoSymtab; - else if (m_sect_headers[idx].flags & llvm::COFF::IMAGE_SCN_CNT_CODE) { - section_type = eSectionTypeCode; - } else if (m_sect_headers[idx].flags & - llvm::COFF::IMAGE_SCN_CNT_INITIALIZED_DATA) { - section_type = eSectionTypeData; - } else if (m_sect_headers[idx].flags & - llvm::COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA) { - if (m_sect_headers[idx].size == 0) - section_type = eSectionTypeZeroFill; - else - section_type = eSectionTypeData; - } + llvm::StringRef sect_name = GetSectionName(m_sect_headers[idx]); + ConstString const_sect_name(sect_name); + SectionType section_type = GetSectionType(sect_name, m_sect_headers[idx]); SectionSP section_sp(new Section( module_sp, // Module to which this section belongs diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h index 78088ecc43778..c0efe702f5700 100644 --- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h +++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h @@ -283,6 +283,8 @@ class ObjectFilePECOFF : public lldb_private::ObjectFile { void DumpDependentModules(lldb_private::Stream *s); llvm::StringRef GetSectionName(const section_header_t &sect); + static lldb::SectionType GetSectionType(llvm::StringRef sect_name, + const section_header_t &sect); typedef std::vector<section_header_t> SectionHeaderColl; typedef SectionHeaderColl::iterator SectionHeaderCollIter; diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index 6a3e6b4cadefc..ae9f20db43cc2 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -1106,7 +1106,7 @@ static FileSpec
GetXcodeSelectPath() { std::string command_output; Status status = Host::RunShellCommand("/usr/bin/xcode-select --print-path", - nullptr, // current working directory + FileSpec(), // current working directory &exit_status, &signo, &command_output, std::chrono::seconds(2), // short timeout false); // don't run in a shell diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp index 95ba81a2ab493..134a4c7c80759 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformMacOSX.cpp @@ -180,11 +180,11 @@ ConstString PlatformMacOSX::GetSDKDirectory(lldb_private::Target &target) { std::string output; const char *command = "xcrun -sdk macosx --show-sdk-path"; lldb_private::Status error = RunShellCommand( - command, // shell command to run - nullptr, // current working directory - &status, // Put the exit status of the process in here - &signo, // Put the signal that caused the process to exit in - // here + command, // shell command to run + FileSpec(), // current working directory + &status, // Put the exit status of the process in here + &signo, // Put the signal that caused the process to exit in + // here &output, // Get the output from the command and place it in this // string std::chrono::seconds(3)); diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp index e9bb29293189d..0aa129c808d43 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformRemoteDarwinDevice.cpp @@ -449,12 +449,10 @@ Status PlatformRemoteDarwinDevice::GetSymbolFile(const FileSpec &platform_file, Status error; char platform_file_path[PATH_MAX]; if (platform_file.GetPath(platform_file_path, sizeof(platform_file_path))) { - char resolved_path[PATH_MAX]; - const char *os_version_dir = GetDeviceSupportDirectoryForOSVersion(); if (os_version_dir) { - ::snprintf(resolved_path, sizeof(resolved_path), "%s/%s", os_version_dir, - platform_file_path); + std::string resolved_path = + (llvm::Twine(os_version_dir) + "/" + platform_file_path).str(); local_file.SetFile(resolved_path, FileSpec::Style::native); FileSystem::Instance().Resolve(local_file); @@ -466,31 +464,28 @@ Status PlatformRemoteDarwinDevice::GetSymbolFile(const FileSpec &platform_file, return error; } - ::snprintf(resolved_path, sizeof(resolved_path), "%s/Symbols.Internal/%s", - os_version_dir, platform_file_path); + resolved_path = (llvm::Twine(os_version_dir) + "/Symbols.Internal/" + + platform_file_path) + .str(); local_file.SetFile(resolved_path, FileSpec::Style::native); FileSystem::Instance().Resolve(local_file); if (FileSystem::Instance().Exists(local_file)) { - if (log) { - LLDB_LOGF( - log, - "Found a copy of %s in the DeviceSupport dir %s/Symbols.Internal", - platform_file_path, os_version_dir); - } + LLDB_LOGF( + log, + "Found a copy of %s in the DeviceSupport dir %s/Symbols.Internal", + platform_file_path, os_version_dir); return error; } - ::snprintf(resolved_path, sizeof(resolved_path), "%s/Symbols/%s", - os_version_dir, platform_file_path); + resolved_path = + (llvm::Twine(os_version_dir) + "/Symbols/" + platform_file_path) + .str(); local_file.SetFile(resolved_path, FileSpec::Style::native); FileSystem::Instance().Resolve(local_file); if (FileSystem::Instance().Exists(local_file)) { - if (log) { - LLDB_LOGF(log, - "Found a copy of %s in the DeviceSupport dir 
%s/Symbols", - platform_file_path, os_version_dir); - } + LLDB_LOGF(log, "Found a copy of %s in the DeviceSupport dir %s/Symbols", + platform_file_path, os_version_dir); return error; } } diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index b12e21deb4592..f24856bc5b3f6 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -223,7 +223,7 @@ static uint32_t chown_file(Platform *platform, const char *path, command.Printf(":%d", gid); command.Printf("%s", path); int status; - platform->RunShellCommand(command.GetData(), nullptr, &status, nullptr, + platform->RunShellCommand(command.GetData(), FileSpec(), &status, nullptr, nullptr, std::chrono::seconds(10)); return status; } @@ -235,7 +235,7 @@ PlatformPOSIX::PutFile(const lldb_private::FileSpec &source, Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PLATFORM)); if (IsHost()) { - if (FileSpec::Equal(source, destination, true)) + if (source == destination) return Status(); // cp src dst // chown uid:gid dst @@ -248,7 +248,7 @@ PlatformPOSIX::PutFile(const lldb_private::FileSpec &source, StreamString command; command.Printf("cp %s %s", src_path.c_str(), dst_path.c_str()); int status; - RunShellCommand(command.GetData(), nullptr, &status, nullptr, nullptr, + RunShellCommand(command.GetData(), FileSpec(), &status, nullptr, nullptr, std::chrono::seconds(10)); if (status != 0) return Status("unable to perform copy"); @@ -278,7 +278,7 @@ PlatformPOSIX::PutFile(const lldb_private::FileSpec &source, GetHostname(), dst_path.c_str()); LLDB_LOGF(log, "[PutFile] Running command: %s\n", command.GetData()); int retcode; - Host::RunShellCommand(command.GetData(), nullptr, &retcode, nullptr, + Host::RunShellCommand(command.GetData(), FileSpec(), &retcode, nullptr, nullptr, std::chrono::minutes(1)); if (retcode == 0) { // Don't chown a local file for a remote system @@ -307,14 +307,14 @@ lldb_private::Status PlatformPOSIX::GetFile( if (dst_path.empty()) return Status("unable to get file path for destination"); if (IsHost()) { - if (FileSpec::Equal(source, destination, true)) + if (source == destination) return Status("local scenario->source and destination are the same file " "path: no operation performed"); // cp src dst StreamString cp_command; cp_command.Printf("cp %s %s", src_path.c_str(), dst_path.c_str()); int status; - RunShellCommand(cp_command.GetData(), nullptr, &status, nullptr, nullptr, + RunShellCommand(cp_command.GetData(), FileSpec(), &status, nullptr, nullptr, std::chrono::seconds(10)); if (status != 0) return Status("unable to perform copy"); @@ -335,7 +335,7 @@ lldb_private::Status PlatformPOSIX::GetFile( dst_path.c_str()); LLDB_LOGF(log, "[GetFile] Running command: %s\n", command.GetData()); int retcode; - Host::RunShellCommand(command.GetData(), nullptr, &retcode, nullptr, + Host::RunShellCommand(command.GetData(), FileSpec(), &retcode, nullptr, nullptr, std::chrono::minutes(1)); if (retcode == 0) return Status(); diff --git a/lldb/source/Plugins/Process/Windows/Common/RegisterContextWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/RegisterContextWindows.cpp index 28e7a590ff9f3..c3cb45530f2ad 100644 --- a/lldb/source/Plugins/Process/Windows/Common/RegisterContextWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/RegisterContextWindows.cpp @@ -154,15 +154,8 @@ bool RegisterContextWindows::CacheAllRegisterValues() { return true; TargetThreadWindows &wthread = static_cast(m_thread); - 
uint8_t buffer[2048]; - memset(buffer, 0, sizeof(buffer)); - PCONTEXT tmpContext = NULL; - DWORD contextLength = (DWORD)sizeof(buffer); - if (!::InitializeContext(buffer, kWinContextFlags, &tmpContext, - &contextLength)) { - return false; - } - memcpy(&m_context, tmpContext, sizeof(m_context)); + memset(&m_context, 0, sizeof(m_context)); + m_context.ContextFlags = kWinContextFlags; if (::SuspendThread( wthread.GetHostThread().GetNativeThread().GetSystemHandle()) == (DWORD)-1) { diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 5d1dd79c2ffa7..dfef06aa6eafb 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -388,36 +388,6 @@ bool ProcessGDBRemote::ParsePythonTargetDefinition( return false; } -// If the remote stub didn't give us eh_frame or DWARF register numbers for a -// register, see if the ABI can provide them. -// DWARF and eh_frame register numbers are defined as a part of the ABI. -static void AugmentRegisterInfoViaABI(RegisterInfo &reg_info, - ConstString reg_name, ABISP abi_sp) { - if (reg_info.kinds[eRegisterKindEHFrame] == LLDB_INVALID_REGNUM || - reg_info.kinds[eRegisterKindDWARF] == LLDB_INVALID_REGNUM) { - if (abi_sp) { - RegisterInfo abi_reg_info; - if (abi_sp->GetRegisterInfoByName(reg_name, abi_reg_info)) { - if (reg_info.kinds[eRegisterKindEHFrame] == LLDB_INVALID_REGNUM && - abi_reg_info.kinds[eRegisterKindEHFrame] != LLDB_INVALID_REGNUM) { - reg_info.kinds[eRegisterKindEHFrame] = - abi_reg_info.kinds[eRegisterKindEHFrame]; - } - if (reg_info.kinds[eRegisterKindDWARF] == LLDB_INVALID_REGNUM && - abi_reg_info.kinds[eRegisterKindDWARF] != LLDB_INVALID_REGNUM) { - reg_info.kinds[eRegisterKindDWARF] = - abi_reg_info.kinds[eRegisterKindDWARF]; - } - if (reg_info.kinds[eRegisterKindGeneric] == LLDB_INVALID_REGNUM && - abi_reg_info.kinds[eRegisterKindGeneric] != LLDB_INVALID_REGNUM) { - reg_info.kinds[eRegisterKindGeneric] = - abi_reg_info.kinds[eRegisterKindGeneric]; - } - } - } - } -} - static size_t SplitCommaSeparatedRegisterNumberString( const llvm::StringRef &comma_separated_regiter_numbers, std::vector<uint32_t> &regnums, int base) { @@ -615,12 +585,12 @@ void ProcessGDBRemote::BuildDynamicRegisterInfo(bool force) { reg_info.invalidate_regs = invalidate_regs.data(); } + reg_info.name = reg_name.AsCString(); // We have to make a temporary ABI here, and not use the GetABI because // this code gets called in DidAttach, when the target architecture // (and consequently the ABI we'll get from the process) may be wrong.
- ABISP abi_to_use = ABI::FindPlugin(shared_from_this(), arch_to_use); - - AugmentRegisterInfoViaABI(reg_info, reg_name, abi_to_use); + if (ABISP abi_sp = ABI::FindPlugin(shared_from_this(), arch_to_use)) + abi_sp->AugmentRegisterInfo(reg_info); m_register_info.AddRegister(reg_info, reg_name, alt_name, set_name); } else { @@ -4483,7 +4453,9 @@ bool ParseRegisters(XMLNode feature_node, GdbServerTargetInfo &target_info, } ++cur_reg_num; - AugmentRegisterInfoViaABI(reg_info, reg_name, abi_sp); + reg_info.name = reg_name.AsCString(); + if (abi_sp) + abi_sp->AugmentRegisterInfo(reg_info); dyn_reg_info.AddRegister(reg_info, reg_name, alt_name, set_name); return true; // Keep iterating through all "reg" elements diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 5ed01cf479344..f6b918399cdc7 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -77,11 +77,23 @@ extern "C" void init_lldb(void); #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +// Disable warning C4190: 'LLDBSwigPythonBreakpointCallbackFunction' has +// C-linkage specified, but returns UDT 'llvm::Expected' which is +// incompatible with C +#if _MSC_VER +#pragma warning (push) +#pragma warning (disable : 4190) +#endif + extern "C" llvm::Expected LLDBSwigPythonBreakpointCallbackFunction( const char *python_function_name, const char *session_dictionary_name, const lldb::StackFrameSP &sb_frame, const lldb::BreakpointLocationSP &sb_bp_loc, StructuredDataImpl *args_impl); +#if _MSC_VER +#pragma warning (pop) +#endif + #pragma clang diagnostic pop extern "C" bool LLDBSwigPythonWatchpointCallbackFunction( diff --git a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp index 29d2e8a0c6a84..b2c4d08833414 100644 --- a/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp +++ b/lldb/source/Plugins/SymbolFile/Breakpad/SymbolFileBreakpad.cpp @@ -731,7 +731,7 @@ void SymbolFileBreakpad::ParseLineTableAndSupportFiles(CompileUnit &cu, } if (next_addr) finish_sequence(); - data.support_files = map.translate(cu, *m_files); + data.support_files = map.translate(cu.GetPrimaryFile(), *m_files); } void SymbolFileBreakpad::ParseUnwindData() { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index 01655f04c4223..09f5b28449cb1 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -463,13 +463,8 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc, const dw_tag_t tag = die.Tag(); - Type::ResolveState resolve_state = Type::ResolveState::Unresolved; - - Type::EncodingDataType encoding_data_type = Type::eEncodingIsUID; - CompilerType clang_type; - TypeSP type_sp; - LanguageType cu_language = die.GetLanguage(); + switch (tag) { case DW_TAG_typedef: case DW_TAG_base_type: @@ -480,844 +475,888 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc, case DW_TAG_restrict_type: case DW_TAG_volatile_type: case DW_TAG_unspecified_type: { - if (tag == DW_TAG_typedef && attrs.type.IsValid()) { - // Try to parse a typedef from the (DWARF embedded in the) Clang - // module file first as modules can contain 
typedef'ed - // structures that have no names like: - // - // typedef struct { int a; } Foo; - // - // In this case we will have a structure with no name and a - // typedef named "Foo" that points to this unnamed - // structure. The name in the typedef is the only identifier for - // the struct, so always try to get typedefs from Clang modules - // if possible. - // - // The type_sp returned will be empty if the typedef doesn't - // exist in a module file, so it is cheap to call this function - // just to check. - // - // If we don't do this we end up creating a TypeSP that says - // this is a typedef to type 0x123 (the DW_AT_type value would - // be 0x123 in the DW_TAG_typedef), and this is the unnamed - // structure type. We will have a hard time tracking down an - // unnammed structure type in the module debug info, so we make - // sure we don't get into this situation by always resolving - // typedefs from the module. - const DWARFDIE encoding_die = attrs.type.Reference(); - - // First make sure that the die that this is typedef'ed to _is_ - // just a declaration (DW_AT_declaration == 1), not a full - // definition since template types can't be represented in - // modules since only concrete instances of templates are ever - // emitted and modules won't contain those - if (encoding_die && - encoding_die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0) == 1) { - type_sp = ParseTypeFromClangModule(sc, die, log); - if (type_sp) - return type_sp; - } - } + type_sp = ParseTypeModifier(sc, die, attrs); + break; + } - DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\") type => 0x%8.8lx\n", - die.GetID(), DW_TAG_value_to_name(tag), type_name_cstr, - encoding_uid.Reference()); + case DW_TAG_structure_type: + case DW_TAG_union_type: + case DW_TAG_class_type: { + type_sp = ParseStructureLikeDIE(sc, die, attrs); + break; + } - switch (tag) { - default: - break; + case DW_TAG_enumeration_type: { + type_sp = ParseEnum(sc, die, attrs); + break; + } - case DW_TAG_unspecified_type: - if (attrs.name == "nullptr_t" || attrs.name == "decltype(nullptr)") { - resolve_state = Type::ResolveState::Full; - clang_type = m_ast.GetBasicType(eBasicTypeNullPtr); - break; - } - // Fall through to base type below in case we can handle the type - // there... - LLVM_FALLTHROUGH; + case DW_TAG_inlined_subroutine: + case DW_TAG_subprogram: + case DW_TAG_subroutine_type: { + type_sp = ParseSubroutine(die, attrs); + break; + } + case DW_TAG_array_type: { + type_sp = ParseArrayType(die, attrs); + break; + } + case DW_TAG_ptr_to_member_type: { + type_sp = ParsePointerToMemberType(die, attrs); + break; + } + default: + dwarf->GetObjectFile()->GetModule()->ReportError( + "{0x%8.8x}: unhandled type tag 0x%4.4x (%s), please file a bug and " + "attach the file at the start of this error message", + die.GetOffset(), tag, DW_TAG_value_to_name(tag)); + break; + } - case DW_TAG_base_type: - resolve_state = Type::ResolveState::Full; - clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( - attrs.name.GetCString(), attrs.encoding, - attrs.byte_size.getValueOr(0) * 8); - break; + // TODO: We should consider making the switch above exhaustive to simplify + // control flow in ParseTypeFromDWARF. Then, we could simply replace this + // return statement with a call to llvm_unreachable. 
+ return UpdateSymbolContextScopeForType(sc, die, type_sp); +} - case DW_TAG_pointer_type: - encoding_data_type = Type::eEncodingIsPointerUID; - break; - case DW_TAG_reference_type: - encoding_data_type = Type::eEncodingIsLValueReferenceUID; - break; - case DW_TAG_rvalue_reference_type: - encoding_data_type = Type::eEncodingIsRValueReferenceUID; - break; - case DW_TAG_typedef: - encoding_data_type = Type::eEncodingIsTypedefUID; - break; - case DW_TAG_const_type: - encoding_data_type = Type::eEncodingIsConstUID; - break; - case DW_TAG_restrict_type: - encoding_data_type = Type::eEncodingIsRestrictUID; - break; - case DW_TAG_volatile_type: - encoding_data_type = Type::eEncodingIsVolatileUID; +lldb::TypeSP +DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, + const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs) { + Log *log(LogChannelDWARF::GetLogIfAny(DWARF_LOG_TYPE_COMPLETION | + DWARF_LOG_LOOKUPS)); + SymbolFileDWARF *dwarf = die.GetDWARF(); + const dw_tag_t tag = die.Tag(); + LanguageType cu_language = die.GetLanguage(); + Type::ResolveState resolve_state = Type::ResolveState::Unresolved; + Type::EncodingDataType encoding_data_type = Type::eEncodingIsUID; + TypeSP type_sp; + CompilerType clang_type; + + if (tag == DW_TAG_typedef && attrs.type.IsValid()) { + // Try to parse a typedef from the (DWARF embedded in the) Clang + // module file first as modules can contain typedef'ed + // structures that have no names like: + // + // typedef struct { int a; } Foo; + // + // In this case we will have a structure with no name and a + // typedef named "Foo" that points to this unnamed + // structure. The name in the typedef is the only identifier for + // the struct, so always try to get typedefs from Clang modules + // if possible. + // + // The type_sp returned will be empty if the typedef doesn't + // exist in a module file, so it is cheap to call this function + // just to check. + // + // If we don't do this we end up creating a TypeSP that says + // this is a typedef to type 0x123 (the DW_AT_type value would + // be 0x123 in the DW_TAG_typedef), and this is the unnamed + // structure type. We will have a hard time tracking down an + // unnammed structure type in the module debug info, so we make + // sure we don't get into this situation by always resolving + // typedefs from the module. + const DWARFDIE encoding_die = attrs.type.Reference(); + + // First make sure that the die that this is typedef'ed to _is_ + // just a declaration (DW_AT_declaration == 1), not a full + // definition since template types can't be represented in + // modules since only concrete instances of templates are ever + // emitted and modules won't contain those + if (encoding_die && + encoding_die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0) == 1) { + type_sp = ParseTypeFromClangModule(sc, die, log); + if (type_sp) + return type_sp; + } + } + + DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\") type => 0x%8.8lx\n", die.GetID(), + DW_TAG_value_to_name(tag), type_name_cstr, + encoding_uid.Reference()); + + switch (tag) { + default: + break; + + case DW_TAG_unspecified_type: + if (attrs.name == "nullptr_t" || attrs.name == "decltype(nullptr)") { + resolve_state = Type::ResolveState::Full; + clang_type = m_ast.GetBasicType(eBasicTypeNullPtr); break; } + // Fall through to base type below in case we can handle the type + // there... 
+ LLVM_FALLTHROUGH; - if (!clang_type && (encoding_data_type == Type::eEncodingIsPointerUID || - encoding_data_type == Type::eEncodingIsTypedefUID)) { - if (tag == DW_TAG_pointer_type) { - DWARFDIE target_die = die.GetReferencedDIE(DW_AT_type); - - if (target_die.GetAttributeValueAsUnsigned(DW_AT_APPLE_block, 0)) { - // Blocks have a __FuncPtr inside them which is a pointer to a - // function of the proper type. - - for (DWARFDIE child_die = target_die.GetFirstChild(); - child_die.IsValid(); child_die = child_die.GetSibling()) { - if (!strcmp(child_die.GetAttributeValueAsString(DW_AT_name, ""), - "__FuncPtr")) { - DWARFDIE function_pointer_type = - child_die.GetReferencedDIE(DW_AT_type); - - if (function_pointer_type) { - DWARFDIE function_type = - function_pointer_type.GetReferencedDIE(DW_AT_type); - - bool function_type_is_new_pointer; - TypeSP lldb_function_type_sp = ParseTypeFromDWARF( - sc, function_type, &function_type_is_new_pointer); - - if (lldb_function_type_sp) { - clang_type = m_ast.CreateBlockPointerType( - lldb_function_type_sp->GetForwardCompilerType()); - encoding_data_type = Type::eEncodingIsUID; - attrs.type.Clear(); - resolve_state = Type::ResolveState::Full; - } - } + case DW_TAG_base_type: + resolve_state = Type::ResolveState::Full; + clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( + attrs.name.GetCString(), attrs.encoding, + attrs.byte_size.getValueOr(0) * 8); + break; - break; + case DW_TAG_pointer_type: + encoding_data_type = Type::eEncodingIsPointerUID; + break; + case DW_TAG_reference_type: + encoding_data_type = Type::eEncodingIsLValueReferenceUID; + break; + case DW_TAG_rvalue_reference_type: + encoding_data_type = Type::eEncodingIsRValueReferenceUID; + break; + case DW_TAG_typedef: + encoding_data_type = Type::eEncodingIsTypedefUID; + break; + case DW_TAG_const_type: + encoding_data_type = Type::eEncodingIsConstUID; + break; + case DW_TAG_restrict_type: + encoding_data_type = Type::eEncodingIsRestrictUID; + break; + case DW_TAG_volatile_type: + encoding_data_type = Type::eEncodingIsVolatileUID; + break; + } + + if (!clang_type && (encoding_data_type == Type::eEncodingIsPointerUID || + encoding_data_type == Type::eEncodingIsTypedefUID)) { + if (tag == DW_TAG_pointer_type) { + DWARFDIE target_die = die.GetReferencedDIE(DW_AT_type); + + if (target_die.GetAttributeValueAsUnsigned(DW_AT_APPLE_block, 0)) { + // Blocks have a __FuncPtr inside them which is a pointer to a + // function of the proper type. 
+ + for (DWARFDIE child_die = target_die.GetFirstChild(); + child_die.IsValid(); child_die = child_die.GetSibling()) { + if (!strcmp(child_die.GetAttributeValueAsString(DW_AT_name, ""), + "__FuncPtr")) { + DWARFDIE function_pointer_type = + child_die.GetReferencedDIE(DW_AT_type); + + if (function_pointer_type) { + DWARFDIE function_type = + function_pointer_type.GetReferencedDIE(DW_AT_type); + + bool function_type_is_new_pointer; + TypeSP lldb_function_type_sp = ParseTypeFromDWARF( + sc, function_type, &function_type_is_new_pointer); + + if (lldb_function_type_sp) { + clang_type = m_ast.CreateBlockPointerType( + lldb_function_type_sp->GetForwardCompilerType()); + encoding_data_type = Type::eEncodingIsUID; + attrs.type.Clear(); + resolve_state = Type::ResolveState::Full; + } } + + break; } } } + } - if (cu_language == eLanguageTypeObjC || - cu_language == eLanguageTypeObjC_plus_plus) { - if (attrs.name) { - static ConstString g_objc_type_name_id("id"); - static ConstString g_objc_type_name_Class("Class"); - static ConstString g_objc_type_name_selector("SEL"); + if (cu_language == eLanguageTypeObjC || + cu_language == eLanguageTypeObjC_plus_plus) { + if (attrs.name) { + if (attrs.name == "id") { + if (log) + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s '%s' " + "is Objective-C 'id' built-in type.", + die.GetOffset(), die.GetTagAsCString(), die.GetName()); + clang_type = m_ast.GetBasicType(eBasicTypeObjCID); + encoding_data_type = Type::eEncodingIsUID; + attrs.type.Clear(); + resolve_state = Type::ResolveState::Full; + } else if (attrs.name == "Class") { + if (log) + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s '%s' " + "is Objective-C 'Class' built-in type.", + die.GetOffset(), die.GetTagAsCString(), die.GetName()); + clang_type = m_ast.GetBasicType(eBasicTypeObjCClass); + encoding_data_type = Type::eEncodingIsUID; + attrs.type.Clear(); + resolve_state = Type::ResolveState::Full; + } else if (attrs.name == "SEL") { + if (log) + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s '%s' " + "is Objective-C 'selector' built-in type.", + die.GetOffset(), die.GetTagAsCString(), die.GetName()); + clang_type = m_ast.GetBasicType(eBasicTypeObjCSel); + encoding_data_type = Type::eEncodingIsUID; + attrs.type.Clear(); + resolve_state = Type::ResolveState::Full; + } + } else if (encoding_data_type == Type::eEncodingIsPointerUID && + attrs.type.IsValid()) { + // Clang sometimes erroneously emits id as objc_object*. In that + // case we fix up the type to "id". 
- if (attrs.name == g_objc_type_name_id) { - if (log) - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s '%s' " - "is Objective-C 'id' built-in type.", - die.GetOffset(), die.GetTagAsCString(), die.GetName()); - clang_type = m_ast.GetBasicType(eBasicTypeObjCID); - encoding_data_type = Type::eEncodingIsUID; - attrs.type.Clear(); - resolve_state = Type::ResolveState::Full; + const DWARFDIE encoding_die = attrs.type.Reference(); - } else if (attrs.name == g_objc_type_name_Class) { + if (encoding_die && encoding_die.Tag() == DW_TAG_structure_type) { + llvm::StringRef struct_name = encoding_die.GetName(); + if (struct_name == "objc_object") { if (log) dwarf->GetObjectFile()->GetModule()->LogMessage( log, - "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s '%s' " - "is Objective-C 'Class' built-in type.", + "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s " + "'%s' is 'objc_object*', which we overrode to " + "'id'.", die.GetOffset(), die.GetTagAsCString(), die.GetName()); - clang_type = m_ast.GetBasicType(eBasicTypeObjCClass); - encoding_data_type = Type::eEncodingIsUID; - attrs.type.Clear(); - resolve_state = Type::ResolveState::Full; - } else if (attrs.name == g_objc_type_name_selector) { - if (log) - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s '%s' " - "is Objective-C 'selector' built-in type.", - die.GetOffset(), die.GetTagAsCString(), die.GetName()); - clang_type = m_ast.GetBasicType(eBasicTypeObjCSel); + clang_type = m_ast.GetBasicType(eBasicTypeObjCID); encoding_data_type = Type::eEncodingIsUID; attrs.type.Clear(); resolve_state = Type::ResolveState::Full; } - } else if (encoding_data_type == Type::eEncodingIsPointerUID && - attrs.type.IsValid()) { - // Clang sometimes erroneously emits id as objc_object*. In that - // case we fix up the type to "id". 
- - const DWARFDIE encoding_die = attrs.type.Reference(); - - if (encoding_die && encoding_die.Tag() == DW_TAG_structure_type) { - if (const char *struct_name = encoding_die.GetName()) { - if (!strcmp(struct_name, "objc_object")) { - if (log) - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF::ParseType (die = 0x%8.8x) %s " - "'%s' is 'objc_object*', which we overrode to " - "'id'.", - die.GetOffset(), die.GetTagAsCString(), die.GetName()); - clang_type = m_ast.GetBasicType(eBasicTypeObjCID); - encoding_data_type = Type::eEncodingIsUID; - attrs.type.Clear(); - resolve_state = Type::ResolveState::Full; - } - } - } } } } - - type_sp = std::make_shared( - die.GetID(), dwarf, attrs.name, attrs.byte_size, nullptr, - dwarf->GetUID(attrs.type.Reference()), encoding_data_type, &attrs.decl, - clang_type, resolve_state); - - dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); - } break; - - case DW_TAG_structure_type: - case DW_TAG_union_type: - case DW_TAG_class_type: { - assert((!type_sp && !clang_type) && - "Did not expect partially computed structure-like type"); - TypeSP struct_like_type_sp = ParseStructureLikeDIE(sc, die, attrs); - return UpdateSymbolContextScopeForType(sc, die, struct_like_type_sp); } - case DW_TAG_enumeration_type: { - if (attrs.is_forward_declaration) { - type_sp = ParseTypeFromClangModule(sc, die, log); - if (type_sp) - return type_sp; + type_sp = std::make_shared( + die.GetID(), dwarf, attrs.name, attrs.byte_size, nullptr, + dwarf->GetUID(attrs.type.Reference()), encoding_data_type, &attrs.decl, + clang_type, resolve_state); - DWARFDeclContext die_decl_ctx; - die.GetDWARFDeclContext(die_decl_ctx); + dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); + return type_sp; +} - type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die_decl_ctx); +TypeSP DWARFASTParserClang::ParseEnum(const SymbolContext &sc, + const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs) { + Log *log(LogChannelDWARF::GetLogIfAny(DWARF_LOG_TYPE_COMPLETION | + DWARF_LOG_LOOKUPS)); + SymbolFileDWARF *dwarf = die.GetDWARF(); + const dw_tag_t tag = die.Tag(); + TypeSP type_sp; - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = - dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, - // see if we have a declaration anywhere else... 
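The forward-declaration handling spelled out above and continued below is a two-level lookup: try the current module's DWARF first, then fall back to the debug-map symbol file, which can see the DWARF inside the individual .o files. A sketch of the pattern under that assumption (the free function is mine, the two calls are the patch's own):

// Hypothetical free-standing form of the two-level definition lookup.
static TypeSP FindEnumDefinition(SymbolFileDWARF &dwarf,
                                 const DWARFDeclContext &die_decl_ctx) {
  // First: a full definition somewhere in this module's DWARF?
  if (TypeSP type_sp =
          dwarf.FindDefinitionTypeForDWARFDeclContext(die_decl_ctx))
    return type_sp;
  // Second: ask the debug map (the .o files) if one exists.
  if (SymbolFileDWARFDebugMap *debug_map = dwarf.GetDebugMapSymfile())
    return debug_map->FindDefinitionTypeForDWARFDeclContext(die_decl_ctx);
  return TypeSP(); // nothing found; the caller keeps the forward decl
}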
- type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext( - die_decl_ctx); - } - } + if (attrs.is_forward_declaration) { + type_sp = ParseTypeFromClangModule(sc, die, log); + if (type_sp) + return type_sp; - if (type_sp) { - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF(%p) - 0x%8.8x: %s type \"%s\" is a " - "forward declaration, complete type is 0x%8.8" PRIx64, - static_cast(this), die.GetOffset(), - DW_TAG_value_to_name(tag), attrs.name.GetCString(), - type_sp->GetID()); - } + DWARFDeclContext die_decl_ctx; + die.GetDWARFDeclContext(die_decl_ctx); - // We found a real definition for this type elsewhere so lets use - // it and cache the fact that we found a complete type for this - // die - dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); - clang::DeclContext *defn_decl_ctx = - GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID())); - if (defn_decl_ctx) - LinkDeclContextToDIE(defn_decl_ctx, die); - return type_sp; + type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die_decl_ctx); + + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, + // see if we have a declaration anywhere else... + type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext( + die_decl_ctx); } } - DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(), - DW_TAG_value_to_name(tag), type_name_cstr); - - CompilerType enumerator_clang_type; - clang_type.SetCompilerType( - &m_ast, dwarf->GetForwardDeclDieToClangType().lookup(die.GetDIE())); - if (!clang_type) { - if (attrs.type.IsValid()) { - Type *enumerator_type = - dwarf->ResolveTypeUID(attrs.type.Reference(), true); - if (enumerator_type) - enumerator_clang_type = enumerator_type->GetFullCompilerType(); - } - if (!enumerator_clang_type) { - if (attrs.byte_size) { - enumerator_clang_type = - m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( - NULL, DW_ATE_signed, *attrs.byte_size * 8); - } else { - enumerator_clang_type = m_ast.GetBasicType(eBasicTypeInt); - } + if (type_sp) { + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF(%p) - 0x%8.8x: %s type \"%s\" is a " + "forward declaration, complete type is 0x%8.8" PRIx64, + static_cast(this), die.GetOffset(), + DW_TAG_value_to_name(tag), attrs.name.GetCString(), + type_sp->GetID()); } - clang_type = m_ast.CreateEnumerationType( - attrs.name.GetCString(), - GetClangDeclContextContainingDIE(die, nullptr), attrs.decl, - enumerator_clang_type, attrs.is_scoped_enum); - } else { - enumerator_clang_type = - m_ast.GetEnumerationIntegerType(clang_type.GetOpaqueQualType()); + // We found a real definition for this type elsewhere so lets use + // it and cache the fact that we found a complete type for this + // die + dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); + clang::DeclContext *defn_decl_ctx = + GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID())); + if (defn_decl_ctx) + LinkDeclContextToDIE(defn_decl_ctx, die); + return type_sp; } + } + DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(), + DW_TAG_value_to_name(tag), type_name_cstr); - LinkDeclContextToDIE(ClangASTContext::GetDeclContextForType(clang_type), - die); - - type_sp = std::make_shared( - die.GetID(), dwarf, attrs.name, attrs.byte_size, nullptr, - dwarf->GetUID(attrs.type.Reference()), Type::eEncodingIsUID, - &attrs.decl, clang_type, Type::ResolveState::Forward); + CompilerType 
enumerator_clang_type; + CompilerType clang_type; + clang_type.SetCompilerType( + &m_ast, dwarf->GetForwardDeclDieToClangType().lookup(die.GetDIE())); + if (!clang_type) { + if (attrs.type.IsValid()) { + Type *enumerator_type = + dwarf->ResolveTypeUID(attrs.type.Reference(), true); + if (enumerator_type) + enumerator_clang_type = enumerator_type->GetFullCompilerType(); + } - if (ClangASTContext::StartTagDeclarationDefinition(clang_type)) { - if (die.HasChildren()) { - bool is_signed = false; - enumerator_clang_type.IsIntegerType(is_signed); - ParseChildEnumerators(clang_type, is_signed, - type_sp->GetByteSize().getValueOr(0), die); + if (!enumerator_clang_type) { + if (attrs.byte_size) { + enumerator_clang_type = m_ast.GetBuiltinTypeForDWARFEncodingAndBitSize( + NULL, DW_ATE_signed, *attrs.byte_size * 8); + } else { + enumerator_clang_type = m_ast.GetBasicType(eBasicTypeInt); } - ClangASTContext::CompleteTagDeclarationDefinition(clang_type); - } else { - dwarf->GetObjectFile()->GetModule()->ReportError( - "DWARF DIE at 0x%8.8x named \"%s\" was not able to start its " - "definition.\nPlease file a bug and attach the file at the " - "start of this error message", - die.GetOffset(), attrs.name.GetCString()); } - } break; - case DW_TAG_inlined_subroutine: - case DW_TAG_subprogram: - case DW_TAG_subroutine_type: { - bool is_variadic = false; - bool is_static = false; - bool has_template_params = false; + clang_type = m_ast.CreateEnumerationType( + attrs.name.GetCString(), GetClangDeclContextContainingDIE(die, nullptr), + attrs.decl, enumerator_clang_type, attrs.is_scoped_enum); + } else { + enumerator_clang_type = + m_ast.GetEnumerationIntegerType(clang_type.GetOpaqueQualType()); + } - unsigned type_quals = 0; + LinkDeclContextToDIE(ClangASTContext::GetDeclContextForType(clang_type), die); - std::string object_pointer_name; - if (attrs.object_pointer) { - const char *object_pointer_name_cstr = attrs.object_pointer.GetName(); - if (object_pointer_name_cstr) - object_pointer_name = object_pointer_name_cstr; - } + type_sp = std::make_shared( + die.GetID(), dwarf, attrs.name, attrs.byte_size, nullptr, + dwarf->GetUID(attrs.type.Reference()), Type::eEncodingIsUID, &attrs.decl, + clang_type, Type::ResolveState::Forward); - DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(), - DW_TAG_value_to_name(tag), type_name_cstr); + if (ClangASTContext::StartTagDeclarationDefinition(clang_type)) { + if (die.HasChildren()) { + bool is_signed = false; + enumerator_clang_type.IsIntegerType(is_signed); + ParseChildEnumerators(clang_type, is_signed, + type_sp->GetByteSize().getValueOr(0), die); + } + ClangASTContext::CompleteTagDeclarationDefinition(clang_type); + } else { + dwarf->GetObjectFile()->GetModule()->ReportError( + "DWARF DIE at 0x%8.8x named \"%s\" was not able to start its " + "definition.\nPlease file a bug and attach the file at the " + "start of this error message", + die.GetOffset(), attrs.name.GetCString()); + } + return type_sp; +} - CompilerType return_clang_type; - Type *func_type = NULL; +TypeSP DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs) { + Log *log(LogChannelDWARF::GetLogIfAny(DWARF_LOG_TYPE_COMPLETION | + DWARF_LOG_LOOKUPS)); - if (attrs.type.IsValid()) - func_type = dwarf->ResolveTypeUID(attrs.type.Reference(), true); + SymbolFileDWARF *dwarf = die.GetDWARF(); + const dw_tag_t tag = die.Tag(); - if (func_type) - return_clang_type = func_type->GetForwardCompilerType(); - else - return_clang_type = m_ast.GetBasicType(eBasicTypeVoid); 
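Both of the fallbacks above follow the same convention for a missing DW_AT_type: an enumeration without an underlying type becomes a signed integer sized from DW_AT_byte_size (or plain 'int' if there is no size either), and a subroutine without a return type returns 'void'. A sketch of the enum half, using the patch's own calls inside a hypothetical helper:

// Hypothetical helper mirroring the enum underlying-type fallback above.
CompilerType PickEnumUnderlyingType(ClangASTContext &ast,
                                    llvm::Optional<uint64_t> byte_size) {
  if (byte_size) // DW_AT_byte_size present: signed int of that width
    return ast.GetBuiltinTypeForDWARFEncodingAndBitSize(
        nullptr, DW_ATE_signed, *byte_size * 8);
  return ast.GetBasicType(eBasicTypeInt); // last resort: plain 'int'
}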
+  bool is_variadic = false;
+  bool is_static = false;
+  bool has_template_params = false;

-    std::vector<CompilerType> function_param_types;
-    std::vector<clang::ParmVarDecl *> function_param_decls;
+  unsigned type_quals = 0;

-    // Parse the function children for the parameters
+  std::string object_pointer_name;
+  if (attrs.object_pointer) {
+    const char *object_pointer_name_cstr = attrs.object_pointer.GetName();
+    if (object_pointer_name_cstr)
+      object_pointer_name = object_pointer_name_cstr;
+  }

-    DWARFDIE decl_ctx_die;
-    clang::DeclContext *containing_decl_ctx =
-        GetClangDeclContextContainingDIE(die, &decl_ctx_die);
-    const clang::Decl::Kind containing_decl_kind =
-        containing_decl_ctx->getDeclKind();
+  DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(),
+               DW_TAG_value_to_name(tag), type_name_cstr);

-    bool is_cxx_method = DeclKindIsCXXClass(containing_decl_kind);
-    // Start off static. This will be set to false in
-    // ParseChildParameters(...) if we find a "this" parameters as the
-    // first parameter
-    if (is_cxx_method) {
-      is_static = true;
-    }
+  CompilerType return_clang_type;
+  Type *func_type = NULL;
+
+  if (attrs.type.IsValid())
+    func_type = dwarf->ResolveTypeUID(attrs.type.Reference(), true);
+
+  if (func_type)
+    return_clang_type = func_type->GetForwardCompilerType();
+  else
+    return_clang_type = m_ast.GetBasicType(eBasicTypeVoid);
+
+  std::vector<CompilerType> function_param_types;
+  std::vector<clang::ParmVarDecl *> function_param_decls;
+
+  // Parse the function children for the parameters
+
+  DWARFDIE decl_ctx_die;
+  clang::DeclContext *containing_decl_ctx =
+      GetClangDeclContextContainingDIE(die, &decl_ctx_die);
+  const clang::Decl::Kind containing_decl_kind =
+      containing_decl_ctx->getDeclKind();
+
+  bool is_cxx_method = DeclKindIsCXXClass(containing_decl_kind);
+  // Start off static. This will be set to false in
+  // ParseChildParameters(...) if we find a "this" parameter as the
+  // first parameter.
+  if (is_cxx_method) {
+    is_static = true;
+  }
+
+  if (die.HasChildren()) {
+    bool skip_artificial = true;
+    ParseChildParameters(containing_decl_ctx, die, skip_artificial, is_static,
+                         is_variadic, has_template_params,
+                         function_param_types, function_param_decls,
+                         type_quals);
+  }
+
+  bool ignore_containing_context = false;
+  // Check for templatized class member functions. If we had any
+  // DW_TAG_template_type_parameter or DW_TAG_template_value_parameter in
+  // the DW_TAG_subprogram DIE, then we can't let this become a method in
+  // a class. Why? Because templatized functions are only emitted if one
+  // of the templatized methods is used in the current compile unit, so
+  // we would end up with classes that may or may not include these member
+  // functions. One class definition then won't match another, which
+  // breaks our ability to use the class in the clang expression parser.
+  // So for the greater good, we currently must not allow any template
+  // member functions in a class definition.
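The rationale in the comment above, which the check below enforces, is easiest to see with a small example. The source here is mine, not from the patch:

// Hypothetical source illustrating why template members are skipped.
struct Widget {
  template <typename T> T as() const { return T(); }
  int value = 0;
};
// a.cpp calls w.as<int>()  -> DWARF for Widget includes as<int>.
// b.cpp never calls as()   -> DWARF for Widget has no 'as' member at all.
// Folding both into one clang class would give two disagreeing member
// lists for the "same" type, so template members are left out entirely.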
+ if (is_cxx_method && has_template_params) { + ignore_containing_context = true; + is_cxx_method = false; + } + + // clang_type will get the function prototype clang type after this + // call + CompilerType clang_type = m_ast.CreateFunctionType( + return_clang_type, function_param_types.data(), + function_param_types.size(), is_variadic, type_quals); - if (die.HasChildren()) { - bool skip_artificial = true; - ParseChildParameters(containing_decl_ctx, die, skip_artificial, is_static, - is_variadic, has_template_params, - function_param_types, function_param_decls, - type_quals); - } - - bool ignore_containing_context = false; - // Check for templatized class member functions. If we had any - // DW_TAG_template_type_parameter or DW_TAG_template_value_parameter - // the DW_TAG_subprogram DIE, then we can't let this become a method in - // a class. Why? Because templatized functions are only emitted if one - // of the templatized methods is used in the current compile unit and - // we will end up with classes that may or may not include these member - // functions and this means one class won't match another class - // definition and it affects our ability to use a class in the clang - // expression parser. So for the greater good, we currently must not - // allow any template member functions in a class definition. - if (is_cxx_method && has_template_params) { - ignore_containing_context = true; - is_cxx_method = false; - } - - // clang_type will get the function prototype clang type after this - // call - clang_type = m_ast.CreateFunctionType( - return_clang_type, function_param_types.data(), - function_param_types.size(), is_variadic, type_quals); - - if (attrs.name) { - bool type_handled = false; - if (tag == DW_TAG_subprogram || tag == DW_TAG_inlined_subroutine) { - ObjCLanguage::MethodName objc_method(attrs.name.GetStringRef(), true); - if (objc_method.IsValid(true)) { - CompilerType class_opaque_type; - ConstString class_name(objc_method.GetClassName()); - if (class_name) { - TypeSP complete_objc_class_type_sp( - dwarf->FindCompleteObjCDefinitionTypeForDIE(DWARFDIE(), - class_name, false)); - - if (complete_objc_class_type_sp) { - CompilerType type_clang_forward_type = - complete_objc_class_type_sp->GetForwardCompilerType(); - if (ClangASTContext::IsObjCObjectOrInterfaceType( - type_clang_forward_type)) - class_opaque_type = type_clang_forward_type; - } + if (attrs.name) { + bool type_handled = false; + if (tag == DW_TAG_subprogram || tag == DW_TAG_inlined_subroutine) { + ObjCLanguage::MethodName objc_method(attrs.name.GetStringRef(), true); + if (objc_method.IsValid(true)) { + CompilerType class_opaque_type; + ConstString class_name(objc_method.GetClassName()); + if (class_name) { + TypeSP complete_objc_class_type_sp( + dwarf->FindCompleteObjCDefinitionTypeForDIE(DWARFDIE(), + class_name, false)); + + if (complete_objc_class_type_sp) { + CompilerType type_clang_forward_type = + complete_objc_class_type_sp->GetForwardCompilerType(); + if (ClangASTContext::IsObjCObjectOrInterfaceType( + type_clang_forward_type)) + class_opaque_type = type_clang_forward_type; } + } - if (class_opaque_type) { - // If accessibility isn't set to anything valid, assume public - // for now... 
-            if (attrs.accessibility == eAccessNone)
-              attrs.accessibility = eAccessPublic;
-
-            clang::ObjCMethodDecl *objc_method_decl =
-                m_ast.AddMethodToObjCObjectType(
-                    class_opaque_type, attrs.name.GetCString(), clang_type,
-                    attrs.accessibility, attrs.is_artificial, is_variadic);
-            type_handled = objc_method_decl != NULL;
-            if (type_handled) {
-              LinkDeclContextToDIE(objc_method_decl, die);
-              m_ast.SetMetadataAsUserID(objc_method_decl, die.GetID());
-            } else {
-              dwarf->GetObjectFile()->GetModule()->ReportError(
-                  "{0x%8.8x}: invalid Objective-C method 0x%4.4x (%s), "
-                  "please file a bug and attach the file at the start of "
-                  "this error message",
-                  die.GetOffset(), tag, DW_TAG_value_to_name(tag));
-            }
+        if (class_opaque_type) {
+          // If accessibility isn't set to anything valid, assume public
+          // for now...
+          if (attrs.accessibility == eAccessNone)
+            attrs.accessibility = eAccessPublic;
+
+          clang::ObjCMethodDecl *objc_method_decl =
+              m_ast.AddMethodToObjCObjectType(
+                  class_opaque_type, attrs.name.GetCString(), clang_type,
+                  attrs.accessibility, attrs.is_artificial, is_variadic);
+          type_handled = objc_method_decl != NULL;
+          if (type_handled) {
+            LinkDeclContextToDIE(objc_method_decl, die);
+            m_ast.SetMetadataAsUserID(objc_method_decl, die.GetID());
+          } else {
+            dwarf->GetObjectFile()->GetModule()->ReportError(
+                "{0x%8.8x}: invalid Objective-C method 0x%4.4x (%s), "
+                "please file a bug and attach the file at the start of "
+                "this error message",
+                die.GetOffset(), tag, DW_TAG_value_to_name(tag));
          }
-          } else if (is_cxx_method) {
-            // Look at the parent of this DIE and see if is is a class or
-            // struct and see if this is actually a C++ method
-            Type *class_type = dwarf->ResolveType(decl_ctx_die);
-            if (class_type) {
-              bool alternate_defn = false;
-              if (class_type->GetID() != decl_ctx_die.GetID() ||
-                  IsClangModuleFwdDecl(decl_ctx_die)) {
-                alternate_defn = true;
-
-                // We uniqued the parent class of this function to another
-                // class so we now need to associate all dies under
-                // "decl_ctx_die" to DIEs in the DIE for "class_type"...
-                DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID());
-
-                if (class_type_die) {
-                  std::vector<DWARFDIE> failures;
-
-                  CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die,
-                                             class_type, failures);
-
-                  // FIXME do something with these failures that's
-                  // smarter than just dropping them on the ground.
-                  // Unfortunately classes don't like having stuff added
-                  // to them after their definitions are complete...
-
-                  type_ptr = dwarf->GetDIEToType()[die.GetDIE()];
-                  if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) {
-                    type_sp = type_ptr->shared_from_this();
-                    break;
-                  }
+        }
+      } else if (is_cxx_method) {
+        // Look at the parent of this DIE and see if it is a class or
+        // struct and see if this is actually a C++ method
+        Type *class_type = dwarf->ResolveType(decl_ctx_die);
+        if (class_type) {
+          bool alternate_defn = false;
+          if (class_type->GetID() != decl_ctx_die.GetID() ||
+              IsClangModuleFwdDecl(decl_ctx_die)) {
+            alternate_defn = true;
+
+            // We uniqued the parent class of this function to another
+            // class so we now need to associate all DIEs under
+            // "decl_ctx_die" to DIEs in the DIE for "class_type"...
+            DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID());
+
+            if (class_type_die) {
+              std::vector<DWARFDIE> failures;
+
+              CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die,
+                                         class_type, failures);
+
+              // FIXME do something with these failures that's
+              // smarter than just dropping them on the ground.
+              // Unfortunately classes don't like having stuff added
+              // to them after their definitions are complete...
+
+              Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()];
+              if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) {
+                return type_ptr->shared_from_this();
                }
              }
+          }

-            if (attrs.specification.IsValid()) {
-              // We have a specification which we are going to base our
-              // function prototype off of, so we need this type to be
-              // completed so that the m_die_to_decl_ctx for the method in
-              // the specification has a valid clang decl context.
-              class_type->GetForwardCompilerType();
-              // If we have a specification, then the function type should
-              // have been made with the specification and not with this
-              // die.
-              DWARFDIE spec_die = attrs.specification.Reference();
-              clang::DeclContext *spec_clang_decl_ctx =
-                  GetClangDeclContextForDIE(spec_die);
-              if (spec_clang_decl_ctx) {
-                LinkDeclContextToDIE(spec_clang_decl_ctx, die);
-              } else {
-                dwarf->GetObjectFile()->GetModule()->ReportWarning(
-                    "0x%8.8" PRIx64 ": DW_AT_specification(0x%8.8x"
-                    ") has no decl\n",
-                    die.GetID(), spec_die.GetOffset());
-              }
-              type_handled = true;
-            } else if (attrs.abstract_origin.IsValid()) {
-              // We have a specification which we are going to base our
-              // function prototype off of, so we need this type to be
-              // completed so that the m_die_to_decl_ctx for the method in
-              // the abstract origin has a valid clang decl context.
-              class_type->GetForwardCompilerType();
-
-              DWARFDIE abs_die = attrs.abstract_origin.Reference();
-              clang::DeclContext *abs_clang_decl_ctx =
-                  GetClangDeclContextForDIE(abs_die);
-              if (abs_clang_decl_ctx) {
-                LinkDeclContextToDIE(abs_clang_decl_ctx, die);
-              } else {
-                dwarf->GetObjectFile()->GetModule()->ReportWarning(
-                    "0x%8.8" PRIx64 ": DW_AT_abstract_origin(0x%8.8x"
-                    ") has no decl\n",
-                    die.GetID(), abs_die.GetOffset());
-              }
-              type_handled = true;
+        if (attrs.specification.IsValid()) {
+          // We have a specification which we are going to base our
+          // function prototype off of, so we need this type to be
+          // completed so that the m_die_to_decl_ctx for the method in
+          // the specification has a valid clang decl context.
+          class_type->GetForwardCompilerType();
+          // If we have a specification, then the function type should
+          // have been made with the specification and not with this
+          // die.
+          DWARFDIE spec_die = attrs.specification.Reference();
+          clang::DeclContext *spec_clang_decl_ctx =
+              GetClangDeclContextForDIE(spec_die);
+          if (spec_clang_decl_ctx) {
+            LinkDeclContextToDIE(spec_clang_decl_ctx, die);
+          } else {
+            dwarf->GetObjectFile()->GetModule()->ReportWarning(
+                "0x%8.8" PRIx64 ": DW_AT_specification(0x%8.8x"
+                ") has no decl\n",
+                die.GetID(), spec_die.GetOffset());
+          }
+          type_handled = true;
+        } else if (attrs.abstract_origin.IsValid()) {
+          // We have an abstract origin which we are going to base our
+          // function prototype on, so we need this type to be
+          // completed so that the m_die_to_decl_ctx for the method in
+          // the abstract origin has a valid clang decl context.
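For reference, the two attributes handled above arise from the usual declaration/definition split in the source. The example is mine, not from the patch:

// Hypothetical source showing where these attributes come from.
struct S {
  void f(); // in-class declaration: a DW_TAG_subprogram inside the class DIE
};
void S::f() {} // out-of-line definition: DW_TAG_subprogram carrying
               // DW_AT_specification -> the in-class declaration DIE
inline void g() {} // each inlined copy gets a DW_TAG_inlined_subroutine
                   // carrying DW_AT_abstract_origin -> g's abstract DIE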
+ class_type->GetForwardCompilerType(); + + DWARFDIE abs_die = attrs.abstract_origin.Reference(); + clang::DeclContext *abs_clang_decl_ctx = + GetClangDeclContextForDIE(abs_die); + if (abs_clang_decl_ctx) { + LinkDeclContextToDIE(abs_clang_decl_ctx, die); } else { - CompilerType class_opaque_type = - class_type->GetForwardCompilerType(); - if (ClangASTContext::IsCXXClassType(class_opaque_type)) { - if (class_opaque_type.IsBeingDefined() || alternate_defn) { - if (!is_static && !die.HasChildren()) { - // We have a C++ member function with no children (this - // pointer!) and clang will get mad if we try and make - // a function that isn't well formed in the DWARF, so - // we will just skip it... - type_handled = true; - } else { - bool add_method = true; - if (alternate_defn) { - // If an alternate definition for the class exists, - // then add the method only if an equivalent is not - // already present. - clang::CXXRecordDecl *record_decl = - m_ast.GetAsCXXRecordDecl( - class_opaque_type.GetOpaqueQualType()); - if (record_decl) { - for (auto method_iter = record_decl->method_begin(); - method_iter != record_decl->method_end(); - method_iter++) { - clang::CXXMethodDecl *method_decl = *method_iter; - if (method_decl->getNameInfo().getAsString() == - attrs.name.GetStringRef()) { - if (method_decl->getType() == - ClangUtil::GetQualType(clang_type)) { - add_method = false; - LinkDeclContextToDIE(method_decl, die); - type_handled = true; - - break; - } + dwarf->GetObjectFile()->GetModule()->ReportWarning( + "0x%8.8" PRIx64 ": DW_AT_abstract_origin(0x%8.8x" + ") has no decl\n", + die.GetID(), abs_die.GetOffset()); + } + type_handled = true; + } else { + CompilerType class_opaque_type = + class_type->GetForwardCompilerType(); + if (ClangASTContext::IsCXXClassType(class_opaque_type)) { + if (class_opaque_type.IsBeingDefined() || alternate_defn) { + if (!is_static && !die.HasChildren()) { + // We have a C++ member function with no children (this + // pointer!) and clang will get mad if we try and make + // a function that isn't well formed in the DWARF, so + // we will just skip it... + type_handled = true; + } else { + bool add_method = true; + if (alternate_defn) { + // If an alternate definition for the class exists, + // then add the method only if an equivalent is not + // already present. + clang::CXXRecordDecl *record_decl = + m_ast.GetAsCXXRecordDecl( + class_opaque_type.GetOpaqueQualType()); + if (record_decl) { + for (auto method_iter = record_decl->method_begin(); + method_iter != record_decl->method_end(); + method_iter++) { + clang::CXXMethodDecl *method_decl = *method_iter; + if (method_decl->getNameInfo().getAsString() == + attrs.name.GetStringRef()) { + if (method_decl->getType() == + ClangUtil::GetQualType(clang_type)) { + add_method = false; + LinkDeclContextToDIE(method_decl, die); + type_handled = true; + + break; } } } } + } - if (add_method) { - llvm::PrettyStackTraceFormat stack_trace( - "SymbolFileDWARF::ParseType() is adding a method " - "%s to class %s in DIE 0x%8.8" PRIx64 " from %s", - attrs.name.GetCString(), - class_type->GetName().GetCString(), die.GetID(), - dwarf->GetObjectFile() - ->GetFileSpec() - .GetPath() - .c_str()); - - const bool is_attr_used = false; - // Neither GCC 4.2 nor clang++ currently set a valid - // accessibility in the DWARF for C++ methods... - // Default to public for now... 
- if (attrs.accessibility == eAccessNone) - attrs.accessibility = eAccessPublic; - - clang::CXXMethodDecl *cxx_method_decl = - m_ast.AddMethodToCXXRecordType( - class_opaque_type.GetOpaqueQualType(), - attrs.name.GetCString(), attrs.mangled_name, - clang_type, attrs.accessibility, attrs.is_virtual, - is_static, attrs.is_inline, attrs.is_explicit, - is_attr_used, attrs.is_artificial); - - type_handled = cxx_method_decl != NULL; - // Artificial methods are always handled even when we - // don't create a new declaration for them. - type_handled |= attrs.is_artificial; - - if (cxx_method_decl) { - LinkDeclContextToDIE(cxx_method_decl, die); - - ClangASTMetadata metadata; - metadata.SetUserID(die.GetID()); - - if (!object_pointer_name.empty()) { - metadata.SetObjectPtrName( - object_pointer_name.c_str()); - LLDB_LOGF(log, - "Setting object pointer name: %s on method " - "object %p.\n", - object_pointer_name.c_str(), - static_cast(cxx_method_decl)); - } - m_ast.SetMetadata(cxx_method_decl, metadata); - } else { - ignore_containing_context = true; + if (add_method) { + llvm::PrettyStackTraceFormat stack_trace( + "SymbolFileDWARF::ParseType() is adding a method " + "%s to class %s in DIE 0x%8.8" PRIx64 " from %s", + attrs.name.GetCString(), + class_type->GetName().GetCString(), die.GetID(), + dwarf->GetObjectFile() + ->GetFileSpec() + .GetPath() + .c_str()); + + const bool is_attr_used = false; + // Neither GCC 4.2 nor clang++ currently set a valid + // accessibility in the DWARF for C++ methods... + // Default to public for now... + if (attrs.accessibility == eAccessNone) + attrs.accessibility = eAccessPublic; + + clang::CXXMethodDecl *cxx_method_decl = + m_ast.AddMethodToCXXRecordType( + class_opaque_type.GetOpaqueQualType(), + attrs.name.GetCString(), attrs.mangled_name, + clang_type, attrs.accessibility, attrs.is_virtual, + is_static, attrs.is_inline, attrs.is_explicit, + is_attr_used, attrs.is_artificial); + + type_handled = cxx_method_decl != NULL; + // Artificial methods are always handled even when we + // don't create a new declaration for them. + type_handled |= attrs.is_artificial; + + if (cxx_method_decl) { + LinkDeclContextToDIE(cxx_method_decl, die); + + ClangASTMetadata metadata; + metadata.SetUserID(die.GetID()); + + if (!object_pointer_name.empty()) { + metadata.SetObjectPtrName( + object_pointer_name.c_str()); + LLDB_LOGF(log, + "Setting object pointer name: %s on method " + "object %p.\n", + object_pointer_name.c_str(), + static_cast(cxx_method_decl)); } + m_ast.SetMetadata(cxx_method_decl, metadata); + } else { + ignore_containing_context = true; } } - } else { - // We were asked to parse the type for a method in a - // class, yet the class hasn't been asked to complete - // itself through the clang::ExternalASTSource protocol, - // so we need to just have the class complete itself and - // do things the right way, then our - // DIE should then have an entry in the - // dwarf->GetDIEToType() map. First - // we need to modify the dwarf->GetDIEToType() so it - // doesn't think we are trying to parse this DIE - // anymore... - dwarf->GetDIEToType()[die.GetDIE()] = NULL; - - // Now we get the full type to force our class type to - // complete itself using the clang::ExternalASTSource - // protocol which will parse all base classes and all - // methods (including the method for this DIE). 
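The GetDIEToType() bookkeeping described above relies on a sentinel value to detect recursion: while a DIE is on the parse stack its map slot holds DIE_IS_BEING_PARSED, and clearing the slot to NULL (as the code above does) deliberately forces a fresh parse during the class completion that GetFullCompilerType() triggers. A stripped-down sketch of the lookup side; the map shape and helper are hypothetical stand-ins:

// Hypothetical sketch of the sentinel-based recursion guard.
Type *LookupParsedType(llvm::DenseMap<const void *, Type *> &die_to_type,
                       const void *die, Type *sentinel /*DIE_IS_BEING_PARSED*/) {
  Type *type = die_to_type.lookup(die);
  if (type == sentinel) // this DIE is being parsed right now; don't recurse
    return nullptr;
  return type; // nullptr if never parsed, otherwise the finished type
}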
- class_type->GetFullCompilerType(); - - // The type for this DIE should have been filled in the - // function call above - type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; - if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { - type_sp = type_ptr->shared_from_this(); - break; - } - - // FIXME This is fixing some even uglier behavior but we - // really need to - // uniq the methods of each class as well as the class - // itself. - type_handled = true; } + } else { + // We were asked to parse the type for a method in a + // class, yet the class hasn't been asked to complete + // itself through the clang::ExternalASTSource protocol, + // so we need to just have the class complete itself and + // do things the right way, then our + // DIE should then have an entry in the + // dwarf->GetDIEToType() map. First + // we need to modify the dwarf->GetDIEToType() so it + // doesn't think we are trying to parse this DIE + // anymore... + dwarf->GetDIEToType()[die.GetDIE()] = NULL; + + // Now we get the full type to force our class type to + // complete itself using the clang::ExternalASTSource + // protocol which will parse all base classes and all + // methods (including the method for this DIE). + class_type->GetFullCompilerType(); + + // The type for this DIE should have been filled in the + // function call above + Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; + if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { + return type_ptr->shared_from_this(); + } + + // FIXME This is fixing some even uglier behavior but we + // really need to + // uniq the methods of each class as well as the class + // itself. + type_handled = true; } } } } } + } - if (!type_handled) { - clang::FunctionDecl *function_decl = nullptr; - clang::FunctionDecl *template_function_decl = nullptr; + if (!type_handled) { + clang::FunctionDecl *function_decl = nullptr; + clang::FunctionDecl *template_function_decl = nullptr; - if (attrs.abstract_origin.IsValid()) { - DWARFDIE abs_die = attrs.abstract_origin.Reference(); + if (attrs.abstract_origin.IsValid()) { + DWARFDIE abs_die = attrs.abstract_origin.Reference(); - if (dwarf->ResolveType(abs_die)) { - function_decl = llvm::dyn_cast_or_null( - GetCachedClangDeclContextForDIE(abs_die)); + if (dwarf->ResolveType(abs_die)) { + function_decl = llvm::dyn_cast_or_null( + GetCachedClangDeclContextForDIE(abs_die)); - if (function_decl) { - LinkDeclContextToDIE(function_decl, die); - } + if (function_decl) { + LinkDeclContextToDIE(function_decl, die); } } + } - if (!function_decl) { - // We just have a function that isn't part of a class - function_decl = m_ast.CreateFunctionDeclaration( + if (!function_decl) { + // We just have a function that isn't part of a class + function_decl = m_ast.CreateFunctionDeclaration( + ignore_containing_context ? m_ast.GetTranslationUnitDecl() + : containing_decl_ctx, + attrs.name.GetCString(), clang_type, attrs.storage, + attrs.is_inline); + + if (has_template_params) { + ClangASTContext::TemplateParameterInfos template_param_infos; + ParseTemplateParameterInfos(die, template_param_infos); + template_function_decl = m_ast.CreateFunctionDeclaration( ignore_containing_context ? 
m_ast.GetTranslationUnitDecl() : containing_decl_ctx, attrs.name.GetCString(), clang_type, attrs.storage, attrs.is_inline); + clang::FunctionTemplateDecl *func_template_decl = + m_ast.CreateFunctionTemplateDecl( + containing_decl_ctx, template_function_decl, + attrs.name.GetCString(), template_param_infos); + m_ast.CreateFunctionTemplateSpecializationInfo( + function_decl, func_template_decl, template_param_infos); + } - if (has_template_params) { - ClangASTContext::TemplateParameterInfos template_param_infos; - ParseTemplateParameterInfos(die, template_param_infos); - template_function_decl = m_ast.CreateFunctionDeclaration( - ignore_containing_context ? m_ast.GetTranslationUnitDecl() - : containing_decl_ctx, - attrs.name.GetCString(), clang_type, attrs.storage, - attrs.is_inline); - clang::FunctionTemplateDecl *func_template_decl = - m_ast.CreateFunctionTemplateDecl( - containing_decl_ctx, template_function_decl, - attrs.name.GetCString(), template_param_infos); - m_ast.CreateFunctionTemplateSpecializationInfo( - function_decl, func_template_decl, template_param_infos); - } + lldbassert(function_decl); - lldbassert(function_decl); + if (function_decl) { + LinkDeclContextToDIE(function_decl, die); - if (function_decl) { - LinkDeclContextToDIE(function_decl, die); - - if (!function_param_decls.empty()) { - m_ast.SetFunctionParameters(function_decl, + if (!function_param_decls.empty()) { + m_ast.SetFunctionParameters(function_decl, + &function_param_decls.front(), + function_param_decls.size()); + if (template_function_decl) + m_ast.SetFunctionParameters(template_function_decl, &function_param_decls.front(), function_param_decls.size()); - if (template_function_decl) - m_ast.SetFunctionParameters(template_function_decl, - &function_param_decls.front(), - function_param_decls.size()); - } + } - ClangASTMetadata metadata; - metadata.SetUserID(die.GetID()); + ClangASTMetadata metadata; + metadata.SetUserID(die.GetID()); - if (!object_pointer_name.empty()) { - metadata.SetObjectPtrName(object_pointer_name.c_str()); - LLDB_LOGF(log, - "Setting object pointer name: %s on function " - "object %p.", - object_pointer_name.c_str(), - static_cast(function_decl)); - } - m_ast.SetMetadata(function_decl, metadata); + if (!object_pointer_name.empty()) { + metadata.SetObjectPtrName(object_pointer_name.c_str()); + LLDB_LOGF(log, + "Setting object pointer name: %s on function " + "object %p.", + object_pointer_name.c_str(), + static_cast(function_decl)); } + m_ast.SetMetadata(function_decl, metadata); } } } - type_sp = std::make_shared( - die.GetID(), dwarf, attrs.name, llvm::None, nullptr, LLDB_INVALID_UID, - Type::eEncodingIsUID, &attrs.decl, clang_type, Type::ResolveState::Full); - assert(type_sp.get()); - } break; + } + return std::make_shared( + die.GetID(), dwarf, attrs.name, llvm::None, nullptr, LLDB_INVALID_UID, + Type::eEncodingIsUID, &attrs.decl, clang_type, Type::ResolveState::Full); +} - case DW_TAG_array_type: { - DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(), - DW_TAG_value_to_name(tag), type_name_cstr); +TypeSP DWARFASTParserClang::ParseArrayType(const DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs) { + SymbolFileDWARF *dwarf = die.GetDWARF(); - DWARFDIE type_die = attrs.type.Reference(); - Type *element_type = dwarf->ResolveTypeUID(type_die, true); + DEBUG_PRINTF("0x%8.8" PRIx64 ": %s (\"%s\")\n", die.GetID(), + DW_TAG_value_to_name(tag), type_name_cstr); - if (element_type) { - auto array_info = ParseChildArrayInfo(die); - if (array_info) { - attrs.byte_stride = 
array_info->byte_stride; - attrs.bit_stride = array_info->bit_stride; - } - if (attrs.byte_stride == 0 && attrs.bit_stride == 0) - attrs.byte_stride = element_type->GetByteSize().getValueOr(0); - CompilerType array_element_type = element_type->GetForwardCompilerType(); - - if (ClangASTContext::IsCXXClassType(array_element_type) && - !array_element_type.GetCompleteType()) { - ModuleSP module_sp = die.GetModule(); - if (module_sp) { - if (die.GetCU()->GetProducer() == eProducerClang) - module_sp->ReportError( - "DWARF DW_TAG_array_type DIE at 0x%8.8x has a " - "class/union/struct element type DIE 0x%8.8x that is a " - "forward declaration, not a complete definition.\nTry " - "compiling the source file with -fstandalone-debug or " - "disable -gmodules", - die.GetOffset(), type_die.GetOffset()); - else - module_sp->ReportError( - "DWARF DW_TAG_array_type DIE at 0x%8.8x has a " - "class/union/struct element type DIE 0x%8.8x that is a " - "forward declaration, not a complete definition.\nPlease " - "file a bug against the compiler and include the " - "preprocessed output for %s", - die.GetOffset(), type_die.GetOffset(), - GetUnitName(die).c_str()); - } + DWARFDIE type_die = attrs.type.Reference(); + Type *element_type = dwarf->ResolveTypeUID(type_die, true); - // We have no choice other than to pretend that the element class - // type is complete. If we don't do this, clang will crash when - // trying to layout the class. Since we provide layout - // assistance, all ivars in this class and other classes will be - // fine, this is the best we can do short of crashing. - if (ClangASTContext::StartTagDeclarationDefinition( - array_element_type)) { - ClangASTContext::CompleteTagDeclarationDefinition(array_element_type); - } else { - module_sp->ReportError("DWARF DIE at 0x%8.8x was not able to " - "start its definition.\nPlease file a " - "bug and attach the file at the start " - "of this error message", - type_die.GetOffset()); - } - } + if (!element_type) + return nullptr; - uint64_t array_element_bit_stride = - attrs.byte_stride * 8 + attrs.bit_stride; - if (array_info && array_info->element_orders.size() > 0) { - uint64_t num_elements = 0; - auto end = array_info->element_orders.rend(); - for (auto pos = array_info->element_orders.rbegin(); pos != end; - ++pos) { - num_elements = *pos; - clang_type = m_ast.CreateArrayType(array_element_type, num_elements, - attrs.is_vector); - array_element_type = clang_type; - array_element_bit_stride = - num_elements ? 
array_element_bit_stride * num_elements - : array_element_bit_stride; - } - } else { - clang_type = m_ast.CreateArrayType(array_element_type, 0, attrs.is_vector); - } - ConstString empty_name; - type_sp = std::make_shared( - die.GetID(), dwarf, empty_name, array_element_bit_stride / 8, nullptr, - dwarf->GetUID(type_die), Type::eEncodingIsUID, &attrs.decl, - clang_type, Type::ResolveState::Full); - type_sp->SetEncodingType(element_type); - m_ast.SetMetadataAsUserID(clang_type.GetOpaqueQualType(), die.GetID()); + llvm::Optional array_info = ParseChildArrayInfo(die); + if (array_info) { + attrs.byte_stride = array_info->byte_stride; + attrs.bit_stride = array_info->bit_stride; + } + if (attrs.byte_stride == 0 && attrs.bit_stride == 0) + attrs.byte_stride = element_type->GetByteSize().getValueOr(0); + CompilerType array_element_type = element_type->GetForwardCompilerType(); + + if (ClangASTContext::IsCXXClassType(array_element_type) && + !array_element_type.GetCompleteType()) { + ModuleSP module_sp = die.GetModule(); + if (module_sp) { + if (die.GetCU()->GetProducer() == eProducerClang) + module_sp->ReportError( + "DWARF DW_TAG_array_type DIE at 0x%8.8x has a " + "class/union/struct element type DIE 0x%8.8x that is a " + "forward declaration, not a complete definition.\nTry " + "compiling the source file with -fstandalone-debug or " + "disable -gmodules", + die.GetOffset(), type_die.GetOffset()); + else + module_sp->ReportError( + "DWARF DW_TAG_array_type DIE at 0x%8.8x has a " + "class/union/struct element type DIE 0x%8.8x that is a " + "forward declaration, not a complete definition.\nPlease " + "file a bug against the compiler and include the " + "preprocessed output for %s", + die.GetOffset(), type_die.GetOffset(), GetUnitName(die).c_str()); + } + + // We have no choice other than to pretend that the element class + // type is complete. If we don't do this, clang will crash when + // trying to layout the class. Since we provide layout + // assistance, all ivars in this class and other classes will be + // fine, this is the best we can do short of crashing. + if (ClangASTContext::StartTagDeclarationDefinition(array_element_type)) { + ClangASTContext::CompleteTagDeclarationDefinition(array_element_type); + } else { + module_sp->ReportError("DWARF DIE at 0x%8.8x was not able to " + "start its definition.\nPlease file a " + "bug and attach the file at the start " + "of this error message", + type_die.GetOffset()); } - } break; + } - case DW_TAG_ptr_to_member_type: { - Type *pointee_type = dwarf->ResolveTypeUID(attrs.type.Reference(), true); - Type *class_type = - dwarf->ResolveTypeUID(attrs.containing_type.Reference(), true); + uint64_t array_element_bit_stride = + attrs.byte_stride * 8 + attrs.bit_stride; + CompilerType clang_type; + if (array_info && array_info->element_orders.size() > 0) { + uint64_t num_elements = 0; + auto end = array_info->element_orders.rend(); + for (auto pos = array_info->element_orders.rbegin(); pos != end; ++pos) { + num_elements = *pos; + clang_type = m_ast.CreateArrayType(array_element_type, num_elements, + attrs.is_vector); + array_element_type = clang_type; + array_element_bit_stride = num_elements + ? 
array_element_bit_stride * num_elements + : array_element_bit_stride; + } + } else { + clang_type = + m_ast.CreateArrayType(array_element_type, 0, attrs.is_vector); + } + ConstString empty_name; + TypeSP type_sp = std::make_shared( + die.GetID(), dwarf, empty_name, array_element_bit_stride / 8, nullptr, + dwarf->GetUID(type_die), Type::eEncodingIsUID, &attrs.decl, clang_type, + Type::ResolveState::Full); + type_sp->SetEncodingType(element_type); + m_ast.SetMetadataAsUserID(clang_type.GetOpaqueQualType(), die.GetID()); + return type_sp; +} - CompilerType pointee_clang_type = pointee_type->GetForwardCompilerType(); - CompilerType class_clang_type = class_type->GetLayoutCompilerType(); +TypeSP DWARFASTParserClang::ParsePointerToMemberType( + const DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs) { + SymbolFileDWARF *dwarf = die.GetDWARF(); + Type *pointee_type = dwarf->ResolveTypeUID(attrs.type.Reference(), true); + Type *class_type = + dwarf->ResolveTypeUID(attrs.containing_type.Reference(), true); - clang_type = ClangASTContext::CreateMemberPointerType(class_clang_type, - pointee_clang_type); + CompilerType pointee_clang_type = pointee_type->GetForwardCompilerType(); + CompilerType class_clang_type = class_type->GetLayoutCompilerType(); - if (llvm::Optional clang_type_size = - clang_type.GetByteSize(nullptr)) { - type_sp = std::make_shared( - die.GetID(), dwarf, attrs.name, *clang_type_size, nullptr, - LLDB_INVALID_UID, Type::eEncodingIsUID, nullptr, clang_type, - Type::ResolveState::Forward); - } + CompilerType clang_type = ClangASTContext::CreateMemberPointerType( + class_clang_type, pointee_clang_type); - break; + if (llvm::Optional clang_type_size = + clang_type.GetByteSize(nullptr)) { + return std::make_shared(die.GetID(), dwarf, attrs.name, + *clang_type_size, nullptr, LLDB_INVALID_UID, + Type::eEncodingIsUID, nullptr, clang_type, + Type::ResolveState::Forward); } - default: - dwarf->GetObjectFile()->GetModule()->ReportError( - "{0x%8.8x}: unhandled type tag 0x%4.4x (%s), please file a bug and " - "attach the file at the start of this error message", - die.GetOffset(), tag, DW_TAG_value_to_name(tag)); - break; - } - - // TODO: We should consider making the switch above exhaustive to simplify - // control flow in ParseTypeFromDWARF. Then, we could simply replace this - // return statement with a call to llvm_unreachable. 
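ParseArrayType above builds multidimensional arrays inside out: the dimensions are walked in reverse so the innermost element type is wrapped first, and the bit stride is scaled up at each level. A simplified sketch under those assumptions; the helper is mine, the CreateArrayType call is the patch's own:

// Hypothetical helper mirroring the inside-out construction above.
// For 'int a[2][3]', element_orders is {2, 3}; walking it in reverse wraps
// int -> int[3] -> int[2][3], scaling the bit stride at each level.
CompilerType BuildNestedArrayType(ClangASTContext &ast, CompilerType element,
                                  llvm::ArrayRef<uint64_t> element_orders,
                                  bool is_vector, uint64_t &bit_stride) {
  for (uint64_t num_elements : llvm::reverse(element_orders)) {
    element = ast.CreateArrayType(element, num_elements, is_vector);
    if (num_elements) // a zero count (incomplete array) leaves the stride alone
      bit_stride *= num_elements;
  }
  return element;
}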
- return UpdateSymbolContextScopeForType(sc, die, type_sp); + return nullptr; } TypeSP DWARFASTParserClang::UpdateSymbolContextScopeForType( @@ -1330,20 +1369,20 @@ TypeSP DWARFASTParserClang::UpdateSymbolContextScopeForType( DWARFDIE sc_parent_die = SymbolFileDWARF::GetParentSymbolContextDIE(die); dw_tag_t sc_parent_tag = sc_parent_die.Tag(); - SymbolContextScope *symbol_context_scope = NULL; + SymbolContextScope *symbol_context_scope = nullptr; if (sc_parent_tag == DW_TAG_compile_unit || sc_parent_tag == DW_TAG_partial_unit) { symbol_context_scope = sc.comp_unit; - } else if (sc.function != NULL && sc_parent_die) { + } else if (sc.function != nullptr && sc_parent_die) { symbol_context_scope = sc.function->GetBlock(true).FindBlockByID(sc_parent_die.GetID()); - if (symbol_context_scope == NULL) + if (symbol_context_scope == nullptr) symbol_context_scope = sc.function; } else { symbol_context_scope = sc.module_sp.get(); } - if (symbol_context_scope != NULL) + if (symbol_context_scope != nullptr) type_sp->SetSymbolContextScope(symbol_context_scope); // We are ready to put this type into the uniqued list up at the module @@ -1930,300 +1969,214 @@ bool DWARFASTParserClang::ParseTemplateParameterInfos( return template_param_infos.args.size() == template_param_infos.names.size(); } -bool DWARFASTParserClang::CompleteTypeFromDWARF(const DWARFDIE &die, - lldb_private::Type *type, - CompilerType &clang_type) { +bool DWARFASTParserClang::CompleteRecordType(const DWARFDIE &die, + lldb_private::Type *type, + CompilerType &clang_type) { + const dw_tag_t tag = die.Tag(); SymbolFileDWARF *dwarf = die.GetDWARF(); - std::lock_guard guard( - dwarf->GetObjectFile()->GetModule()->GetMutex()); + ClangASTImporter::LayoutInfo layout_info; - // Disable external storage for this type so we don't get anymore - // clang::ExternalASTSource queries for this type. - m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), false); - - if (!die) - return false; - -#if defined LLDB_CONFIGURATION_DEBUG - // For debugging purposes, the LLDB_DWARF_DONT_COMPLETE_TYPENAMES environment - // variable can be set with one or more typenames separated by ';' - // characters. This will cause this function to not complete any types whose - // names match. - // - // Examples of setting this environment variable: - // - // LLDB_DWARF_DONT_COMPLETE_TYPENAMES=Foo - // LLDB_DWARF_DONT_COMPLETE_TYPENAMES=Foo;Bar;Baz - const char *dont_complete_typenames_cstr = - getenv("LLDB_DWARF_DONT_COMPLETE_TYPENAMES"); - if (dont_complete_typenames_cstr && dont_complete_typenames_cstr[0]) { - const char *die_name = die.GetName(); - if (die_name && die_name[0]) { - const char *match = strstr(dont_complete_typenames_cstr, die_name); - if (match) { - size_t die_name_length = strlen(die_name); - while (match) { - const char separator_char = ';'; - const char next_char = match[die_name_length]; - if (next_char == '\0' || next_char == separator_char) { - if (match == dont_complete_typenames_cstr || - match[-1] == separator_char) - return false; - } - match = strstr(match + 1, die_name); - } + { + if (die.HasChildren()) { + LanguageType class_language = eLanguageTypeUnknown; + if (ClangASTContext::IsObjCObjectOrInterfaceType(clang_type)) { + class_language = eLanguageTypeObjC; + // For objective C we don't start the definition when the class is + // created. 
+ ClangASTContext::StartTagDeclarationDefinition(clang_type); } - } - } -#endif - const dw_tag_t tag = die.Tag(); - - Log *log = - nullptr; // (LogChannelDWARF::GetLogIfAny(DWARF_LOG_DEBUG_INFO|DWARF_LOG_TYPE_COMPLETION)); - if (log) - dwarf->GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( - log, "0x%8.8" PRIx64 ": %s '%s' resolving forward declaration...", - die.GetID(), die.GetTagAsCString(), type->GetName().AsCString()); - assert(clang_type); - DWARFAttributes attributes; - switch (tag) { - case DW_TAG_structure_type: - case DW_TAG_union_type: - case DW_TAG_class_type: { - ClangASTImporter::LayoutInfo layout_info; - - { - if (die.HasChildren()) { - LanguageType class_language = eLanguageTypeUnknown; - if (ClangASTContext::IsObjCObjectOrInterfaceType(clang_type)) { - class_language = eLanguageTypeObjC; - // For objective C we don't start the definition when the class is - // created. - ClangASTContext::StartTagDeclarationDefinition(clang_type); - } - - int tag_decl_kind = -1; - AccessType default_accessibility = eAccessNone; - if (tag == DW_TAG_structure_type) { - tag_decl_kind = clang::TTK_Struct; - default_accessibility = eAccessPublic; - } else if (tag == DW_TAG_union_type) { - tag_decl_kind = clang::TTK_Union; - default_accessibility = eAccessPublic; - } else if (tag == DW_TAG_class_type) { - tag_decl_kind = clang::TTK_Class; - default_accessibility = eAccessPrivate; - } - - std::vector> bases; - std::vector member_accessibilities; - bool is_a_class = false; - // Parse members and base classes first - std::vector member_function_dies; - - DelayedPropertyList delayed_properties; - ParseChildMembers(die, clang_type, class_language, bases, - member_accessibilities, member_function_dies, - delayed_properties, default_accessibility, is_a_class, - layout_info); - - // Now parse any methods if there were any... - for (const DWARFDIE &die : member_function_dies) - dwarf->ResolveType(die); - - if (class_language == eLanguageTypeObjC) { - ConstString class_name(clang_type.GetTypeName()); - if (class_name) { - DIEArray method_die_offsets; - dwarf->GetObjCMethodDIEOffsets(class_name, method_die_offsets); - - if (!method_die_offsets.empty()) { - DWARFDebugInfo *debug_info = dwarf->DebugInfo(); - - const size_t num_matches = method_die_offsets.size(); - for (size_t i = 0; i < num_matches; ++i) { - const DIERef &die_ref = method_die_offsets[i]; - DWARFDIE method_die = debug_info->GetDIE(die_ref); - - if (method_die) - method_die.ResolveType(); - } - } - - for (DelayedPropertyList::iterator pi = delayed_properties.begin(), - pe = delayed_properties.end(); - pi != pe; ++pi) - pi->Finalize(); - } - } - - // If we have a DW_TAG_structure_type instead of a DW_TAG_class_type we - // need to tell the clang type it is actually a class. - if (class_language != eLanguageTypeObjC) { - if (is_a_class && tag_decl_kind != clang::TTK_Class) - m_ast.SetTagTypeKind(ClangUtil::GetQualType(clang_type), - clang::TTK_Class); - } - - // Since DW_TAG_structure_type gets used for both classes and - // structures, we may need to set any DW_TAG_member fields to have a - // "private" access if none was specified. When we parsed the child - // members we tracked that actual accessibility value for each - // DW_TAG_member in the "member_accessibilities" array. If the value - // for the member is zero, then it was set to the - // "default_accessibility" which for structs was "public". Below we - // correct this by setting any fields to "private" that weren't - // correctly set. 
- if (is_a_class && !member_accessibilities.empty()) { - // This is a class and all members that didn't have their access - // specified are private. - m_ast.SetDefaultAccessForRecordFields( - m_ast.GetAsRecordDecl(clang_type), eAccessPrivate, - &member_accessibilities.front(), member_accessibilities.size()); - } + int tag_decl_kind = -1; + AccessType default_accessibility = eAccessNone; + if (tag == DW_TAG_structure_type) { + tag_decl_kind = clang::TTK_Struct; + default_accessibility = eAccessPublic; + } else if (tag == DW_TAG_union_type) { + tag_decl_kind = clang::TTK_Union; + default_accessibility = eAccessPublic; + } else if (tag == DW_TAG_class_type) { + tag_decl_kind = clang::TTK_Class; + default_accessibility = eAccessPrivate; + } - if (!bases.empty()) { - // Make sure all base classes refer to complete types and not forward - // declarations. If we don't do this, clang will crash with an - // assertion in the call to clang_type.TransferBaseClasses() - for (const auto &base_class : bases) { - clang::TypeSourceInfo *type_source_info = - base_class->getTypeSourceInfo(); - if (type_source_info) { - CompilerType base_class_type( - &m_ast, type_source_info->getType().getAsOpaquePtr()); - if (!base_class_type.GetCompleteType()) { - auto module = dwarf->GetObjectFile()->GetModule(); - module->ReportError(":: Class '%s' has a base class '%s' which " - "does not have a complete definition.", - die.GetName(), - base_class_type.GetTypeName().GetCString()); - if (die.GetCU()->GetProducer() == eProducerClang) - module->ReportError(":: Try compiling the source file with " - "-fstandalone-debug."); - - // We have no choice other than to pretend that the base class - // is complete. If we don't do this, clang will crash when we - // call setBases() inside of - // "clang_type.TransferBaseClasses()" below. Since we - // provide layout assistance, all ivars in this class and other - // classes will be fine, this is the best we can do short of - // crashing. - if (ClangASTContext::StartTagDeclarationDefinition( - base_class_type)) { - ClangASTContext::CompleteTagDeclarationDefinition( - base_class_type); - } - } + std::vector> bases; + std::vector member_accessibilities; + bool is_a_class = false; + // Parse members and base classes first + std::vector member_function_dies; + + DelayedPropertyList delayed_properties; + ParseChildMembers(die, clang_type, class_language, bases, + member_accessibilities, member_function_dies, + delayed_properties, default_accessibility, is_a_class, + layout_info); + + // Now parse any methods if there were any... 
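As the comment above says, record completion is deliberately two-phase: ParseChildMembers lays down fields and bases while only collecting the method DIEs, and the next few lines then resolve those methods once the record shape exists. In outline (not literal patch code):

// Phase one: members and bases; method DIEs are merely collected.
std::vector<DWARFDIE> member_function_dies;
ParseChildMembers(die, clang_type, class_language, bases,
                  member_accessibilities, member_function_dies,
                  delayed_properties, default_accessibility, is_a_class,
                  layout_info);
// Phase two: only now is it safe to build method decls on the record.
for (const DWARFDIE &method_die : member_function_dies)
  dwarf->ResolveType(method_die);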
+ for (const DWARFDIE &die : member_function_dies) + dwarf->ResolveType(die); + + if (class_language == eLanguageTypeObjC) { + ConstString class_name(clang_type.GetTypeName()); + if (class_name) { + DIEArray method_die_offsets; + dwarf->GetObjCMethodDIEOffsets(class_name, method_die_offsets); + + if (!method_die_offsets.empty()) { + DWARFDebugInfo *debug_info = dwarf->DebugInfo(); + + const size_t num_matches = method_die_offsets.size(); + for (size_t i = 0; i < num_matches; ++i) { + const DIERef &die_ref = method_die_offsets[i]; + DWARFDIE method_die = debug_info->GetDIE(die_ref); + + if (method_die) + method_die.ResolveType(); } } - m_ast.TransferBaseClasses(clang_type.GetOpaqueQualType(), - std::move(bases)); + for (DelayedPropertyList::iterator pi = delayed_properties.begin(), + pe = delayed_properties.end(); + pi != pe; ++pi) + pi->Finalize(); } } - } - m_ast.AddMethodOverridesForCXXRecordType(clang_type.GetOpaqueQualType()); - ClangASTContext::BuildIndirectFields(clang_type); - ClangASTContext::CompleteTagDeclarationDefinition(clang_type); - - if (!layout_info.field_offsets.empty() || - !layout_info.base_offsets.empty() || - !layout_info.vbase_offsets.empty()) { - if (type) - layout_info.bit_size = type->GetByteSize().getValueOr(0) * 8; - if (layout_info.bit_size == 0) - layout_info.bit_size = - die.GetAttributeValueAsUnsigned(DW_AT_byte_size, 0) * 8; - - clang::CXXRecordDecl *record_decl = - m_ast.GetAsCXXRecordDecl(clang_type.GetOpaqueQualType()); - if (record_decl) { - if (log) { - ModuleSP module_sp = dwarf->GetObjectFile()->GetModule(); + // If we have a DW_TAG_structure_type instead of a DW_TAG_class_type we + // need to tell the clang type it is actually a class. + if (class_language != eLanguageTypeObjC) { + if (is_a_class && tag_decl_kind != clang::TTK_Class) + m_ast.SetTagTypeKind(ClangUtil::GetQualType(clang_type), + clang::TTK_Class); + } - if (module_sp) { - module_sp->LogMessage( - log, - "ClangASTContext::CompleteTypeFromDWARF (clang_type = %p) " - "caching layout info for record_decl = %p, bit_size = %" PRIu64 - ", alignment = %" PRIu64 - ", field_offsets[%u], base_offsets[%u], vbase_offsets[%u])", - static_cast(clang_type.GetOpaqueQualType()), - static_cast(record_decl), layout_info.bit_size, - layout_info.alignment, - static_cast(layout_info.field_offsets.size()), - static_cast(layout_info.base_offsets.size()), - static_cast(layout_info.vbase_offsets.size())); - - uint32_t idx; - { - llvm::DenseMap::const_iterator - pos, - end = layout_info.field_offsets.end(); - for (idx = 0, pos = layout_info.field_offsets.begin(); pos != end; - ++pos, ++idx) { - module_sp->LogMessage( - log, "ClangASTContext::CompleteTypeFromDWARF (clang_type = " - "%p) field[%u] = { bit_offset=%u, name='%s' }", - static_cast(clang_type.GetOpaqueQualType()), idx, - static_cast(pos->second), - pos->first->getNameAsString().c_str()); - } - } + // Since DW_TAG_structure_type gets used for both classes and + // structures, we may need to set any DW_TAG_member fields to have a + // "private" access if none was specified. When we parsed the child + // members we tracked that actual accessibility value for each + // DW_TAG_member in the "member_accessibilities" array. If the value + // for the member is zero, then it was set to the + // "default_accessibility" which for structs was "public". Below we + // correct this by setting any fields to "private" that weren't + // correctly set. 
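The default-access fixup described in the comment above exists because producers omit DW_AT_accessibility for members that have the default access, while DW_TAG_structure_type is emitted for both structs and classes. A small illustration of mine, not from the patch:

// Both of these can arrive as DW_TAG_structure_type with no
// DW_AT_accessibility on 'x', so once the parser decides the type is
// really a class it must re-apply 'private' to the unmarked members.
struct A { int x; }; // default access: public
class  B { int x; }; // default access: private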
+ if (is_a_class && !member_accessibilities.empty()) { + // This is a class and all members that didn't have their access + // specified are private. + m_ast.SetDefaultAccessForRecordFields( + m_ast.GetAsRecordDecl(clang_type), eAccessPrivate, + &member_accessibilities.front(), member_accessibilities.size()); + } - { - llvm::DenseMap::const_iterator base_pos, - base_end = layout_info.base_offsets.end(); - for (idx = 0, base_pos = layout_info.base_offsets.begin(); - base_pos != base_end; ++base_pos, ++idx) { - module_sp->LogMessage( - log, "ClangASTContext::CompleteTypeFromDWARF (clang_type = " - "%p) base[%u] = { byte_offset=%u, name='%s' }", - clang_type.GetOpaqueQualType(), idx, - (uint32_t)base_pos->second.getQuantity(), - base_pos->first->getNameAsString().c_str()); - } - } - { - llvm::DenseMap::const_iterator vbase_pos, - vbase_end = layout_info.vbase_offsets.end(); - for (idx = 0, vbase_pos = layout_info.vbase_offsets.begin(); - vbase_pos != vbase_end; ++vbase_pos, ++idx) { - module_sp->LogMessage( - log, "ClangASTContext::CompleteTypeFromDWARF (clang_type = " - "%p) vbase[%u] = { byte_offset=%u, name='%s' }", - static_cast(clang_type.GetOpaqueQualType()), idx, - static_cast(vbase_pos->second.getQuantity()), - vbase_pos->first->getNameAsString().c_str()); + if (!bases.empty()) { + // Make sure all base classes refer to complete types and not forward + // declarations. If we don't do this, clang will crash with an + // assertion in the call to clang_type.TransferBaseClasses() + for (const auto &base_class : bases) { + clang::TypeSourceInfo *type_source_info = + base_class->getTypeSourceInfo(); + if (type_source_info) { + CompilerType base_class_type( + &m_ast, type_source_info->getType().getAsOpaquePtr()); + if (!base_class_type.GetCompleteType()) { + auto module = dwarf->GetObjectFile()->GetModule(); + module->ReportError(":: Class '%s' has a base class '%s' which " + "does not have a complete definition.", + die.GetName(), + base_class_type.GetTypeName().GetCString()); + if (die.GetCU()->GetProducer() == eProducerClang) + module->ReportError(":: Try compiling the source file with " + "-fstandalone-debug."); + + // We have no choice other than to pretend that the base class + // is complete. If we don't do this, clang will crash when we + // call setBases() inside of + // "clang_type.TransferBaseClasses()" below. Since we + // provide layout assistance, all ivars in this class and other + // classes will be fine, this is the best we can do short of + // crashing. 
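The situation the forced completion below papers over is a DWARF-level one, so the sketch here is debug-info shaped rather than compilable C++; it is my illustration of the error messages above:

// Hypothetical debug info handled above (e.g. a -gmodules build):
//   DW_TAG_structure_type "Base"    DW_AT_declaration(true)  <- no members
//   DW_TAG_class_type     "Derived" with DW_TAG_inheritance -> "Base"
// Completing 'Derived' requires 'Base' to be a complete clang type.
// Rather than crash in setBases(), the parser force-completes an empty
// 'Base' and relies on the explicit layout info for field offsets.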
+            if (ClangASTContext::StartTagDeclarationDefinition(
+                    base_class_type)) {
+              ClangASTContext::CompleteTagDeclarationDefinition(
+                  base_class_type);
             }
           }
         }
       }
-      GetClangASTImporter().InsertRecordDecl(record_decl, layout_info);
+
+      m_ast.TransferBaseClasses(clang_type.GetOpaqueQualType(),
+                                std::move(bases));
     }
   }
 }
-  return (bool)clang_type;
+  m_ast.AddMethodOverridesForCXXRecordType(clang_type.GetOpaqueQualType());
+  ClangASTContext::BuildIndirectFields(clang_type);
+  ClangASTContext::CompleteTagDeclarationDefinition(clang_type);

-  case DW_TAG_enumeration_type:
-    if (ClangASTContext::StartTagDeclarationDefinition(clang_type)) {
-      if (die.HasChildren()) {
-        bool is_signed = false;
-        clang_type.IsIntegerType(is_signed);
-        ParseChildEnumerators(clang_type, is_signed,
-                              type->GetByteSize().getValueOr(0), die);
-      }
-      ClangASTContext::CompleteTagDeclarationDefinition(clang_type);
+  if (!layout_info.field_offsets.empty() || !layout_info.base_offsets.empty() ||
+      !layout_info.vbase_offsets.empty()) {
+    if (type)
+      layout_info.bit_size = type->GetByteSize().getValueOr(0) * 8;
+    if (layout_info.bit_size == 0)
+      layout_info.bit_size =
+          die.GetAttributeValueAsUnsigned(DW_AT_byte_size, 0) * 8;
+
+    clang::CXXRecordDecl *record_decl =
+        m_ast.GetAsCXXRecordDecl(clang_type.GetOpaqueQualType());
+    if (record_decl)
+      GetClangASTImporter().InsertRecordDecl(record_decl, layout_info);
+  }
+
+  return (bool)clang_type;
+}
+
+bool DWARFASTParserClang::CompleteEnumType(const DWARFDIE &die,
+                                           lldb_private::Type *type,
+                                           CompilerType &clang_type) {
+  if (ClangASTContext::StartTagDeclarationDefinition(clang_type)) {
+    if (die.HasChildren()) {
+      bool is_signed = false;
+      clang_type.IsIntegerType(is_signed);
+      ParseChildEnumerators(clang_type, is_signed,
+                            type->GetByteSize().getValueOr(0), die);
     }
-    return (bool)clang_type;
+    ClangASTContext::CompleteTagDeclarationDefinition(clang_type);
+  }
+  return (bool)clang_type;
+}
+
+bool DWARFASTParserClang::CompleteTypeFromDWARF(const DWARFDIE &die,
+                                                lldb_private::Type *type,
+                                                CompilerType &clang_type) {
+  SymbolFileDWARF *dwarf = die.GetDWARF();
+
+  std::lock_guard<std::recursive_mutex> guard(
+      dwarf->GetObjectFile()->GetModule()->GetMutex());
+
+  // Disable external storage for this type so we don't get any more
+  // clang::ExternalASTSource queries for this type.
+ m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), false); + + if (!die) + return false; + + const dw_tag_t tag = die.Tag(); + Log *log = + nullptr; // (LogChannelDWARF::GetLogIfAny(DWARF_LOG_DEBUG_INFO|DWARF_LOG_TYPE_COMPLETION)); + if (log) + dwarf->GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( + log, "0x%8.8" PRIx64 ": %s '%s' resolving forward declaration...", + die.GetID(), die.GetTagAsCString(), type->GetName().AsCString()); + assert(clang_type); + DWARFAttributes attributes; + switch (tag) { + case DW_TAG_structure_type: + case DW_TAG_union_type: + case DW_TAG_class_type: + return CompleteRecordType(die, type, clang_type); + case DW_TAG_enumeration_type: + return CompleteEnumType(die, type, clang_type); default: assert(false && "not a forward clang type decl!"); break; @@ -2495,495 +2448,500 @@ Function *DWARFASTParserClang::ParseFunctionFromDWARF(CompileUnit &comp_unit, return nullptr; } -bool DWARFASTParserClang::ParseChildMembers( - const DWARFDIE &parent_die, CompilerType &class_clang_type, - const LanguageType class_language, - std::vector> &base_classes, +void DWARFASTParserClang::ParseSingleMember( + const DWARFDIE &die, const DWARFDIE &parent_die, + lldb_private::CompilerType &class_clang_type, + const lldb::LanguageType class_language, std::vector &member_accessibilities, - std::vector &member_function_dies, - DelayedPropertyList &delayed_properties, AccessType &default_accessibility, - bool &is_a_class, ClangASTImporter::LayoutInfo &layout_info) { - if (!parent_die) - return false; - + lldb::AccessType &default_accessibility, + DelayedPropertyList &delayed_properties, + lldb_private::ClangASTImporter::LayoutInfo &layout_info, + BitfieldInfo &last_field_info) { + ModuleSP module_sp = parent_die.GetDWARF()->GetObjectFile()->GetModule(); + const dw_tag_t tag = die.Tag(); // Get the parent byte size so we can verify any members will fit const uint64_t parent_byte_size = parent_die.GetAttributeValueAsUnsigned(DW_AT_byte_size, UINT64_MAX); const uint64_t parent_bit_size = parent_byte_size == UINT64_MAX ? UINT64_MAX : parent_byte_size * 8; - uint32_t member_idx = 0; - BitfieldInfo last_field_info; - - ModuleSP module_sp = parent_die.GetDWARF()->GetObjectFile()->GetModule(); - ClangASTContext *ast = - llvm::dyn_cast_or_null(class_clang_type.GetTypeSystem()); - if (ast == nullptr) - return false; - - for (DWARFDIE die = parent_die.GetFirstChild(); die.IsValid(); - die = die.GetSibling()) { - dw_tag_t tag = die.Tag(); - - switch (tag) { - case DW_TAG_member: - case DW_TAG_APPLE_property: { - DWARFAttributes attributes; - const size_t num_attributes = die.GetAttributes(attributes); - if (num_attributes > 0) { - const char *name = nullptr; - const char *prop_name = nullptr; - const char *prop_getter_name = nullptr; - const char *prop_setter_name = nullptr; - uint32_t prop_attributes = 0; - - bool is_artificial = false; - DWARFFormValue encoding_form; - AccessType accessibility = eAccessNone; - uint32_t member_byte_offset = - (parent_die.Tag() == DW_TAG_union_type) ? 
0 : UINT32_MAX; - llvm::Optional byte_size; - int64_t bit_offset = 0; - uint64_t data_bit_offset = UINT64_MAX; - size_t bit_size = 0; - bool is_external = - false; // On DW_TAG_members, this means the member is static - uint32_t i; - for (i = 0; i < num_attributes && !is_artificial; ++i) { - const dw_attr_t attr = attributes.AttributeAtIndex(i); - DWARFFormValue form_value; - if (attributes.ExtractFormValueAtIndex(i, form_value)) { - switch (attr) { - case DW_AT_name: - name = form_value.AsCString(); - break; - case DW_AT_type: - encoding_form = form_value; - break; - case DW_AT_bit_offset: - bit_offset = form_value.Signed(); - break; - case DW_AT_bit_size: - bit_size = form_value.Unsigned(); - break; - case DW_AT_byte_size: - byte_size = form_value.Unsigned(); - break; - case DW_AT_data_bit_offset: - data_bit_offset = form_value.Unsigned(); - break; - case DW_AT_data_member_location: - if (form_value.BlockData()) { - Value initialValue(0); - Value memberOffset(0); - const DWARFDataExtractor &debug_info_data = die.GetData(); - uint32_t block_length = form_value.Unsigned(); - uint32_t block_offset = - form_value.BlockData() - debug_info_data.GetDataStart(); - if (DWARFExpression::Evaluate( - nullptr, // ExecutionContext * - nullptr, // RegisterContext * - module_sp, - DataExtractor(debug_info_data, block_offset, - block_length), - die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr, - memberOffset, nullptr)) { - member_byte_offset = - memberOffset.ResolveValue(nullptr).UInt(); - } - } else { - // With DWARF 3 and later, if the value is an integer constant, - // this form value is the offset in bytes from the beginning of - // the containing entity. - member_byte_offset = form_value.Unsigned(); - } - break; - - case DW_AT_accessibility: - accessibility = DW_ACCESS_to_AccessType(form_value.Unsigned()); - break; - case DW_AT_artificial: - is_artificial = form_value.Boolean(); - break; - case DW_AT_APPLE_property_name: - prop_name = form_value.AsCString(); - break; - case DW_AT_APPLE_property_getter: - prop_getter_name = form_value.AsCString(); - break; - case DW_AT_APPLE_property_setter: - prop_setter_name = form_value.AsCString(); - break; - case DW_AT_APPLE_property_attribute: - prop_attributes = form_value.Unsigned(); - break; - case DW_AT_external: - is_external = form_value.Boolean(); - break; - - default: - case DW_AT_declaration: - case DW_AT_description: - case DW_AT_mutable: - case DW_AT_visibility: - case DW_AT_sibling: - break; + DWARFAttributes attributes; + const size_t num_attributes = die.GetAttributes(attributes); + if (num_attributes > 0) { + const char *name = nullptr; + const char *prop_name = nullptr; + const char *prop_getter_name = nullptr; + const char *prop_setter_name = nullptr; + uint32_t prop_attributes = 0; + + bool is_artificial = false; + DWARFFormValue encoding_form; + AccessType accessibility = eAccessNone; + uint32_t member_byte_offset = + (parent_die.Tag() == DW_TAG_union_type) ? 
0 : UINT32_MAX; + llvm::Optional byte_size; + int64_t bit_offset = 0; + uint64_t data_bit_offset = UINT64_MAX; + size_t bit_size = 0; + bool is_external = + false; // On DW_TAG_members, this means the member is static + uint32_t i; + for (i = 0; i < num_attributes && !is_artificial; ++i) { + const dw_attr_t attr = attributes.AttributeAtIndex(i); + DWARFFormValue form_value; + if (attributes.ExtractFormValueAtIndex(i, form_value)) { + switch (attr) { + case DW_AT_name: + name = form_value.AsCString(); + break; + case DW_AT_type: + encoding_form = form_value; + break; + case DW_AT_bit_offset: + bit_offset = form_value.Signed(); + break; + case DW_AT_bit_size: + bit_size = form_value.Unsigned(); + break; + case DW_AT_byte_size: + byte_size = form_value.Unsigned(); + break; + case DW_AT_data_bit_offset: + data_bit_offset = form_value.Unsigned(); + break; + case DW_AT_data_member_location: + if (form_value.BlockData()) { + Value initialValue(0); + Value memberOffset(0); + const DWARFDataExtractor &debug_info_data = die.GetData(); + uint32_t block_length = form_value.Unsigned(); + uint32_t block_offset = + form_value.BlockData() - debug_info_data.GetDataStart(); + if (DWARFExpression::Evaluate( + nullptr, // ExecutionContext * + nullptr, // RegisterContext * + module_sp, + DataExtractor(debug_info_data, block_offset, block_length), + die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr, + memberOffset, nullptr)) { + member_byte_offset = memberOffset.ResolveValue(nullptr).UInt(); } + } else { + // With DWARF 3 and later, if the value is an integer constant, + // this form value is the offset in bytes from the beginning of + // the containing entity. + member_byte_offset = form_value.Unsigned(); } - } - - if (prop_name) { - ConstString fixed_getter; - ConstString fixed_setter; - - // Check if the property getter/setter were provided as full names. - // We want basenames, so we extract them. - - if (prop_getter_name && prop_getter_name[0] == '-') { - ObjCLanguage::MethodName prop_getter_method(prop_getter_name, true); - prop_getter_name = prop_getter_method.GetSelector().GetCString(); - } + break; - if (prop_setter_name && prop_setter_name[0] == '-') { - ObjCLanguage::MethodName prop_setter_method(prop_setter_name, true); - prop_setter_name = prop_setter_method.GetSelector().GetCString(); - } + case DW_AT_accessibility: + accessibility = DW_ACCESS_to_AccessType(form_value.Unsigned()); + break; + case DW_AT_artificial: + is_artificial = form_value.Boolean(); + break; + case DW_AT_APPLE_property_name: + prop_name = form_value.AsCString(); + break; + case DW_AT_APPLE_property_getter: + prop_getter_name = form_value.AsCString(); + break; + case DW_AT_APPLE_property_setter: + prop_setter_name = form_value.AsCString(); + break; + case DW_AT_APPLE_property_attribute: + prop_attributes = form_value.Unsigned(); + break; + case DW_AT_external: + is_external = form_value.Boolean(); + break; - // If the names haven't been provided, they need to be filled in. + default: + case DW_AT_declaration: + case DW_AT_description: + case DW_AT_mutable: + case DW_AT_visibility: + case DW_AT_sibling: + break; + } + } + } - if (!prop_getter_name) { - prop_getter_name = prop_name; - } - if (!prop_setter_name && prop_name[0] && - !(prop_attributes & DW_APPLE_PROPERTY_readonly)) { - StreamString ss; + if (prop_name) { + ConstString fixed_getter; + ConstString fixed_setter; - ss.Printf("set%c%s:", toupper(prop_name[0]), &prop_name[1]); + // Check if the property getter/setter were provided as full names. 
+ // We want basenames, so we extract them. - fixed_setter.SetString(ss.GetString()); - prop_setter_name = fixed_setter.GetCString(); - } - } + if (prop_getter_name && prop_getter_name[0] == '-') { + ObjCLanguage::MethodName prop_getter_method(prop_getter_name, true); + prop_getter_name = prop_getter_method.GetSelector().GetCString(); + } - // Clang has a DWARF generation bug where sometimes it represents - // fields that are references with bad byte size and bit size/offset - // information such as: - // - // DW_AT_byte_size( 0x00 ) - // DW_AT_bit_size( 0x40 ) - // DW_AT_bit_offset( 0xffffffffffffffc0 ) - // - // So check the bit offset to make sure it is sane, and if the values - // are not sane, remove them. If we don't do this then we will end up - // with a crash if we try to use this type in an expression when clang - // becomes unhappy with its recycled debug info. - - if (byte_size.getValueOr(0) == 0 && bit_offset < 0) { - bit_size = 0; - bit_offset = 0; - } + if (prop_setter_name && prop_setter_name[0] == '-') { + ObjCLanguage::MethodName prop_setter_method(prop_setter_name, true); + prop_setter_name = prop_setter_method.GetSelector().GetCString(); + } - // FIXME: Make Clang ignore Objective-C accessibility for expressions - if (class_language == eLanguageTypeObjC || - class_language == eLanguageTypeObjC_plus_plus) - accessibility = eAccessNone; - - // Handle static members - if (is_external && member_byte_offset == UINT32_MAX) { - Type *var_type = die.ResolveTypeUID(encoding_form.Reference()); - - if (var_type) { - if (accessibility == eAccessNone) - accessibility = eAccessPublic; - ClangASTContext::AddVariableToRecordType( - class_clang_type, name, var_type->GetLayoutCompilerType(), - accessibility); - } - break; - } + // If the names haven't been provided, they need to be filled in. - if (!is_artificial) { - Type *member_type = die.ResolveTypeUID(encoding_form.Reference()); - - clang::FieldDecl *field_decl = nullptr; - if (tag == DW_TAG_member) { - if (member_type) { - if (accessibility == eAccessNone) - accessibility = default_accessibility; - member_accessibilities.push_back(accessibility); - - uint64_t field_bit_offset = - (member_byte_offset == UINT32_MAX ? 0 - : (member_byte_offset * 8)); - if (bit_size > 0) { - - BitfieldInfo this_field_info; - this_field_info.bit_offset = field_bit_offset; - this_field_info.bit_size = bit_size; - - ///////////////////////////////////////////////////////////// - // How to locate a field given the DWARF debug information - // - // AT_byte_size indicates the size of the word in which the bit - // offset must be interpreted. - // - // AT_data_member_location indicates the byte offset of the - // word from the base address of the structure. - // - // AT_bit_offset indicates how many bits into the word - // (according to the host endianness) the low-order bit of the - // field starts. AT_bit_offset can be negative. - // - // AT_bit_size indicates the size of the field in bits. 
- ///////////////////////////////////////////////////////////// - - if (data_bit_offset != UINT64_MAX) { - this_field_info.bit_offset = data_bit_offset; - } else { - if (!byte_size) - byte_size = member_type->GetByteSize(); - - ObjectFile *objfile = die.GetDWARF()->GetObjectFile(); - if (objfile->GetByteOrder() == eByteOrderLittle) { - this_field_info.bit_offset += byte_size.getValueOr(0) * 8; - this_field_info.bit_offset -= (bit_offset + bit_size); - } else { - this_field_info.bit_offset += bit_offset; - } - } + if (!prop_getter_name) { + prop_getter_name = prop_name; + } + if (!prop_setter_name && prop_name[0] && + !(prop_attributes & DW_APPLE_PROPERTY_readonly)) { + StreamString ss; - if ((this_field_info.bit_offset >= parent_bit_size) || - !last_field_info.NextBitfieldOffsetIsValid( - this_field_info.bit_offset)) { - ObjectFile *objfile = die.GetDWARF()->GetObjectFile(); - objfile->GetModule()->ReportWarning( - "0x%8.8" PRIx64 ": %s bitfield named \"%s\" has invalid " - "bit offset (0x%8.8" PRIx64 - ") member will be ignored. Please file a bug against the " - "compiler and include the preprocessed output for %s\n", - die.GetID(), DW_TAG_value_to_name(tag), name, - this_field_info.bit_offset, - GetUnitName(parent_die).c_str()); - this_field_info.Clear(); - continue; - } + ss.Printf("set%c%s:", toupper(prop_name[0]), &prop_name[1]); - // Update the field bit offset we will report for layout - field_bit_offset = this_field_info.bit_offset; - - // If the member to be emitted did not start on a character - // boundary and there is empty space between the last field and - // this one, then we need to emit an anonymous member filling - // up the space up to its start. There are three cases here: - // - // 1 If the previous member ended on a character boundary, then - // we can emit an - // anonymous member starting at the most recent character - // boundary. - // - // 2 If the previous member did not end on a character boundary - // and the distance - // from the end of the previous member to the current member - // is less than a - // word width, then we can emit an anonymous member starting - // right after the - // previous member and right before this member. - // - // 3 If the previous member did not end on a character boundary - // and the distance - // from the end of the previous member to the current member - // is greater than - // or equal a word width, then we act as in Case 1. - - const uint64_t character_width = 8; - const uint64_t word_width = 32; - - // Objective-C has invalid DW_AT_bit_offset values in older - // versions of clang, so we have to be careful and only insert - // unnamed bitfields if we have a new enough clang. 
- bool detect_unnamed_bitfields = true; - - if (class_language == eLanguageTypeObjC || - class_language == eLanguageTypeObjC_plus_plus) - detect_unnamed_bitfields = - die.GetCU()->Supports_unnamed_objc_bitfields(); - - if (detect_unnamed_bitfields) { - BitfieldInfo anon_field_info; - - if ((this_field_info.bit_offset % character_width) != - 0) // not char aligned - { - uint64_t last_field_end = 0; - - if (last_field_info.IsValid()) - last_field_end = - last_field_info.bit_offset + last_field_info.bit_size; - - if (this_field_info.bit_offset != last_field_end) { - if (((last_field_end % character_width) == 0) || // case 1 - (this_field_info.bit_offset - last_field_end >= - word_width)) // case 3 - { - anon_field_info.bit_size = - this_field_info.bit_offset % character_width; - anon_field_info.bit_offset = - this_field_info.bit_offset - - anon_field_info.bit_size; - } else // case 2 - { - anon_field_info.bit_size = - this_field_info.bit_offset - last_field_end; - anon_field_info.bit_offset = last_field_end; - } - } - } + fixed_setter.SetString(ss.GetString()); + prop_setter_name = fixed_setter.GetCString(); + } + } - if (anon_field_info.IsValid()) { - clang::FieldDecl *unnamed_bitfield_decl = - ClangASTContext::AddFieldToRecordType( - class_clang_type, llvm::StringRef(), - m_ast.GetBuiltinTypeForEncodingAndBitSize( - eEncodingSint, word_width), - accessibility, anon_field_info.bit_size); + // Clang has a DWARF generation bug where sometimes it represents + // fields that are references with bad byte size and bit size/offset + // information such as: + // + // DW_AT_byte_size( 0x00 ) + // DW_AT_bit_size( 0x40 ) + // DW_AT_bit_offset( 0xffffffffffffffc0 ) + // + // So check the bit offset to make sure it is sane, and if the values + // are not sane, remove them. If we don't do this then we will end up + // with a crash if we try to use this type in an expression when clang + // becomes unhappy with its recycled debug info. + + if (byte_size.getValueOr(0) == 0 && bit_offset < 0) { + bit_size = 0; + bit_offset = 0; + } + + // FIXME: Make Clang ignore Objective-C accessibility for expressions + if (class_language == eLanguageTypeObjC || + class_language == eLanguageTypeObjC_plus_plus) + accessibility = eAccessNone; + + // Handle static members + if (is_external && member_byte_offset == UINT32_MAX) { + Type *var_type = die.ResolveTypeUID(encoding_form.Reference()); + + if (var_type) { + if (accessibility == eAccessNone) + accessibility = eAccessPublic; + ClangASTContext::AddVariableToRecordType( + class_clang_type, name, var_type->GetLayoutCompilerType(), + accessibility); + } + return; + } + + if (!is_artificial) { + Type *member_type = die.ResolveTypeUID(encoding_form.Reference()); + + clang::FieldDecl *field_decl = nullptr; + if (tag == DW_TAG_member) { + if (member_type) { + if (accessibility == eAccessNone) + accessibility = default_accessibility; + member_accessibilities.push_back(accessibility); + + uint64_t field_bit_offset = + (member_byte_offset == UINT32_MAX ? 0 : (member_byte_offset * 8)); + if (bit_size > 0) { + + BitfieldInfo this_field_info; + this_field_info.bit_offset = field_bit_offset; + this_field_info.bit_size = bit_size; + + ///////////////////////////////////////////////////////////// + // How to locate a field given the DWARF debug information + // + // AT_byte_size indicates the size of the word in which the bit + // offset must be interpreted. + // + // AT_data_member_location indicates the byte offset of the + // word from the base address of the structure. 
+ // + // AT_bit_offset indicates how many bits into the word + // (according to the host endianness) the low-order bit of the + // field starts. AT_bit_offset can be negative. + // + // AT_bit_size indicates the size of the field in bits. + ///////////////////////////////////////////////////////////// + + if (data_bit_offset != UINT64_MAX) { + this_field_info.bit_offset = data_bit_offset; + } else { + if (!byte_size) + byte_size = member_type->GetByteSize(); - layout_info.field_offsets.insert(std::make_pair( - unnamed_bitfield_decl, anon_field_info.bit_offset)); - } - } - last_field_info = this_field_info; + ObjectFile *objfile = die.GetDWARF()->GetObjectFile(); + if (objfile->GetByteOrder() == eByteOrderLittle) { + this_field_info.bit_offset += byte_size.getValueOr(0) * 8; + this_field_info.bit_offset -= (bit_offset + bit_size); } else { - last_field_info.Clear(); + this_field_info.bit_offset += bit_offset; } + } - CompilerType member_clang_type = - member_type->GetLayoutCompilerType(); - if (!member_clang_type.IsCompleteType()) - member_clang_type.GetCompleteType(); + if ((this_field_info.bit_offset >= parent_bit_size) || + !last_field_info.NextBitfieldOffsetIsValid( + this_field_info.bit_offset)) { + ObjectFile *objfile = die.GetDWARF()->GetObjectFile(); + objfile->GetModule()->ReportWarning( + "0x%8.8" PRIx64 ": %s bitfield named \"%s\" has invalid " + "bit offset (0x%8.8" PRIx64 + ") member will be ignored. Please file a bug against the " + "compiler and include the preprocessed output for %s\n", + die.GetID(), DW_TAG_value_to_name(tag), name, + this_field_info.bit_offset, GetUnitName(parent_die).c_str()); + this_field_info.Clear(); + return; + } + // Update the field bit offset we will report for layout + field_bit_offset = this_field_info.bit_offset; + + // If the member to be emitted did not start on a character + // boundary and there is empty space between the last field and + // this one, then we need to emit an anonymous member filling + // up the space up to its start. There are three cases here: + // + // 1 If the previous member ended on a character boundary, then + // we can emit an + // anonymous member starting at the most recent character + // boundary. + // + // 2 If the previous member did not end on a character boundary + // and the distance + // from the end of the previous member to the current member + // is less than a + // word width, then we can emit an anonymous member starting + // right after the + // previous member and right before this member. + // + // 3 If the previous member did not end on a character boundary + // and the distance + // from the end of the previous member to the current member + // is greater than + // or equal a word width, then we act as in Case 1. + + const uint64_t character_width = 8; + const uint64_t word_width = 32; + + // Objective-C has invalid DW_AT_bit_offset values in older + // versions of clang, so we have to be careful and only insert + // unnamed bitfields if we have a new enough clang. + bool detect_unnamed_bitfields = true; + + if (class_language == eLanguageTypeObjC || + class_language == eLanguageTypeObjC_plus_plus) + detect_unnamed_bitfields = + die.GetCU()->Supports_unnamed_objc_bitfields(); + + if (detect_unnamed_bitfields) { + BitfieldInfo anon_field_info; + + if ((this_field_info.bit_offset % character_width) != + 0) // not char aligned { - // Older versions of clang emit array[0] and array[1] in the - // same way (). 
If the current field - // is at the end of the structure, then there is definitely no - // room for extra elements and we override the type to - // array[0]. - - CompilerType member_array_element_type; - uint64_t member_array_size; - bool member_array_is_incomplete; - - if (member_clang_type.IsArrayType( - &member_array_element_type, &member_array_size, - &member_array_is_incomplete) && - !member_array_is_incomplete) { - uint64_t parent_byte_size = - parent_die.GetAttributeValueAsUnsigned(DW_AT_byte_size, - UINT64_MAX); - - if (member_byte_offset >= parent_byte_size) { - if (member_array_size != 1 && - (member_array_size != 0 || - member_byte_offset > parent_byte_size)) { - module_sp->ReportError( - "0x%8.8" PRIx64 - ": DW_TAG_member '%s' refers to type 0x%8.8x" - " which extends beyond the bounds of 0x%8.8" PRIx64, - die.GetID(), name, - encoding_form.Reference().GetOffset(), - parent_die.GetID()); - } + uint64_t last_field_end = 0; - member_clang_type = m_ast.CreateArrayType( - member_array_element_type, 0, false); + if (last_field_info.IsValid()) + last_field_end = + last_field_info.bit_offset + last_field_info.bit_size; + + if (this_field_info.bit_offset != last_field_end) { + if (((last_field_end % character_width) == 0) || // case 1 + (this_field_info.bit_offset - last_field_end >= + word_width)) // case 3 + { + anon_field_info.bit_size = + this_field_info.bit_offset % character_width; + anon_field_info.bit_offset = + this_field_info.bit_offset - anon_field_info.bit_size; + } else // case 2 + { + anon_field_info.bit_size = + this_field_info.bit_offset - last_field_end; + anon_field_info.bit_offset = last_field_end; } } } - if (ClangASTContext::IsCXXClassType(member_clang_type) && - !member_clang_type.GetCompleteType()) { - if (die.GetCU()->GetProducer() == eProducerClang) - module_sp->ReportError( - "DWARF DIE at 0x%8.8x (class %s) has a member variable " - "0x%8.8x (%s) whose type is a forward declaration, not a " - "complete definition.\nTry compiling the source file " - "with -fstandalone-debug", - parent_die.GetOffset(), parent_die.GetName(), - die.GetOffset(), name); - else - module_sp->ReportError( - "DWARF DIE at 0x%8.8x (class %s) has a member variable " - "0x%8.8x (%s) whose type is a forward declaration, not a " - "complete definition.\nPlease file a bug against the " - "compiler and include the preprocessed output for %s", - parent_die.GetOffset(), parent_die.GetName(), - die.GetOffset(), name, GetUnitName(parent_die).c_str()); - // We have no choice other than to pretend that the member - // class is complete. If we don't do this, clang will crash - // when trying to layout the class. Since we provide layout - // assistance, all ivars in this class and other classes will - // be fine, this is the best we can do short of crashing. 
- if (ClangASTContext::StartTagDeclarationDefinition( - member_clang_type)) { - ClangASTContext::CompleteTagDeclarationDefinition( - member_clang_type); - } else { - module_sp->ReportError( - "DWARF DIE at 0x%8.8x (class %s) has a member variable " - "0x%8.8x (%s) whose type claims to be a C++ class but we " - "were not able to start its definition.\nPlease file a " - "bug and attach the file at the start of this error " - "message", - parent_die.GetOffset(), parent_die.GetName(), - die.GetOffset(), name); - } + if (anon_field_info.IsValid()) { + clang::FieldDecl *unnamed_bitfield_decl = + ClangASTContext::AddFieldToRecordType( + class_clang_type, llvm::StringRef(), + m_ast.GetBuiltinTypeForEncodingAndBitSize(eEncodingSint, + word_width), + accessibility, anon_field_info.bit_size); + + layout_info.field_offsets.insert(std::make_pair( + unnamed_bitfield_decl, anon_field_info.bit_offset)); } + } + last_field_info = this_field_info; + } else { + last_field_info.Clear(); + } - field_decl = ClangASTContext::AddFieldToRecordType( - class_clang_type, name, member_clang_type, accessibility, - bit_size); + CompilerType member_clang_type = member_type->GetLayoutCompilerType(); + if (!member_clang_type.IsCompleteType()) + member_clang_type.GetCompleteType(); + + { + // Older versions of clang emit array[0] and array[1] in the + // same way (). If the current field + // is at the end of the structure, then there is definitely no + // room for extra elements and we override the type to + // array[0]. + + CompilerType member_array_element_type; + uint64_t member_array_size; + bool member_array_is_incomplete; + + if (member_clang_type.IsArrayType(&member_array_element_type, + &member_array_size, + &member_array_is_incomplete) && + !member_array_is_incomplete) { + uint64_t parent_byte_size = + parent_die.GetAttributeValueAsUnsigned(DW_AT_byte_size, + UINT64_MAX); + + if (member_byte_offset >= parent_byte_size) { + if (member_array_size != 1 && + (member_array_size != 0 || + member_byte_offset > parent_byte_size)) { + module_sp->ReportError( + "0x%8.8" PRIx64 + ": DW_TAG_member '%s' refers to type 0x%8.8x" + " which extends beyond the bounds of 0x%8.8" PRIx64, + die.GetID(), name, encoding_form.Reference().GetOffset(), + parent_die.GetID()); + } - m_ast.SetMetadataAsUserID(field_decl, die.GetID()); + member_clang_type = + m_ast.CreateArrayType(member_array_element_type, 0, false); + } + } + } - layout_info.field_offsets.insert( - std::make_pair(field_decl, field_bit_offset)); + if (ClangASTContext::IsCXXClassType(member_clang_type) && + !member_clang_type.GetCompleteType()) { + if (die.GetCU()->GetProducer() == eProducerClang) + module_sp->ReportError( + "DWARF DIE at 0x%8.8x (class %s) has a member variable " + "0x%8.8x (%s) whose type is a forward declaration, not a " + "complete definition.\nTry compiling the source file " + "with -fstandalone-debug", + parent_die.GetOffset(), parent_die.GetName(), die.GetOffset(), + name); + else + module_sp->ReportError( + "DWARF DIE at 0x%8.8x (class %s) has a member variable " + "0x%8.8x (%s) whose type is a forward declaration, not a " + "complete definition.\nPlease file a bug against the " + "compiler and include the preprocessed output for %s", + parent_die.GetOffset(), parent_die.GetName(), die.GetOffset(), + name, GetUnitName(parent_die).c_str()); + // We have no choice other than to pretend that the member + // class is complete. If we don't do this, clang will crash + // when trying to layout the class. 
Since we provide layout + // assistance, all ivars in this class and other classes will + // be fine, this is the best we can do short of crashing. + if (ClangASTContext::StartTagDeclarationDefinition( + member_clang_type)) { + ClangASTContext::CompleteTagDeclarationDefinition( + member_clang_type); } else { - if (name) - module_sp->ReportError( - "0x%8.8" PRIx64 - ": DW_TAG_member '%s' refers to type 0x%8.8x" - " which was unable to be parsed", - die.GetID(), name, encoding_form.Reference().GetOffset()); - else - module_sp->ReportError( - "0x%8.8" PRIx64 ": DW_TAG_member refers to type 0x%8.8x" - " which was unable to be parsed", - die.GetID(), encoding_form.Reference().GetOffset()); + module_sp->ReportError( + "DWARF DIE at 0x%8.8x (class %s) has a member variable " + "0x%8.8x (%s) whose type claims to be a C++ class but we " + "were not able to start its definition.\nPlease file a " + "bug and attach the file at the start of this error " + "message", + parent_die.GetOffset(), parent_die.GetName(), die.GetOffset(), + name); } } - if (prop_name != nullptr && member_type) { - clang::ObjCIvarDecl *ivar_decl = nullptr; + field_decl = ClangASTContext::AddFieldToRecordType( + class_clang_type, name, member_clang_type, accessibility, + bit_size); - if (field_decl) { - ivar_decl = clang::dyn_cast(field_decl); - assert(ivar_decl != nullptr); - } + m_ast.SetMetadataAsUserID(field_decl, die.GetID()); + + layout_info.field_offsets.insert( + std::make_pair(field_decl, field_bit_offset)); + } else { + if (name) + module_sp->ReportError( + "0x%8.8" PRIx64 ": DW_TAG_member '%s' refers to type 0x%8.8x" + " which was unable to be parsed", + die.GetID(), name, encoding_form.Reference().GetOffset()); + else + module_sp->ReportError( + "0x%8.8" PRIx64 ": DW_TAG_member refers to type 0x%8.8x" + " which was unable to be parsed", + die.GetID(), encoding_form.Reference().GetOffset()); + } + } - ClangASTMetadata metadata; - metadata.SetUserID(die.GetID()); - delayed_properties.push_back(DelayedAddObjCClassProperty( - class_clang_type, prop_name, - member_type->GetLayoutCompilerType(), ivar_decl, - prop_setter_name, prop_getter_name, prop_attributes, - &metadata)); + if (prop_name != nullptr && member_type) { + clang::ObjCIvarDecl *ivar_decl = nullptr; - if (ivar_decl) - m_ast.SetMetadataAsUserID(ivar_decl, die.GetID()); - } + if (field_decl) { + ivar_decl = clang::dyn_cast(field_decl); + assert(ivar_decl != nullptr); } + + ClangASTMetadata metadata; + metadata.SetUserID(die.GetID()); + delayed_properties.push_back(DelayedAddObjCClassProperty( + class_clang_type, prop_name, member_type->GetLayoutCompilerType(), + ivar_decl, prop_setter_name, prop_getter_name, prop_attributes, + &metadata)); + + if (ivar_decl) + m_ast.SetMetadataAsUserID(ivar_decl, die.GetID()); } - ++member_idx; - } break; + } + } +} + +bool DWARFASTParserClang::ParseChildMembers( + const DWARFDIE &parent_die, CompilerType &class_clang_type, + const LanguageType class_language, + std::vector> &base_classes, + std::vector &member_accessibilities, + std::vector &member_function_dies, + DelayedPropertyList &delayed_properties, AccessType &default_accessibility, + bool &is_a_class, ClangASTImporter::LayoutInfo &layout_info) { + if (!parent_die) + return false; + + BitfieldInfo last_field_info; + + ModuleSP module_sp = parent_die.GetDWARF()->GetObjectFile()->GetModule(); + ClangASTContext *ast = + llvm::dyn_cast_or_null(class_clang_type.GetTypeSystem()); + if (ast == nullptr) + return false; + + for (DWARFDIE die = parent_die.GetFirstChild(); 
die.IsValid();
+       die = die.GetSibling()) {
+    dw_tag_t tag = die.Tag();
+
+    switch (tag) {
+    case DW_TAG_member:
+    case DW_TAG_APPLE_property:
+      ParseSingleMember(die, parent_die, class_clang_type, class_language,
+                        member_accessibilities, default_accessibility,
+                        delayed_properties, layout_info, last_field_info);
+      break;

     case DW_TAG_subprogram:
       // Let the type parsing code handle this one for us.
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
index 982a089981d4e..c5b630e435e94 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h
@@ -168,6 +168,65 @@ class DWARFASTParserClang : public DWARFASTParser {
   // Return true if this type is a declaration to a type in an external
   // module.
   lldb::ModuleSP GetModuleForType(const DWARFDIE &die);
+
+private:
+  struct BitfieldInfo {
+    uint64_t bit_size;
+    uint64_t bit_offset;
+
+    BitfieldInfo()
+        : bit_size(LLDB_INVALID_ADDRESS), bit_offset(LLDB_INVALID_ADDRESS) {}
+
+    void Clear() {
+      bit_size = LLDB_INVALID_ADDRESS;
+      bit_offset = LLDB_INVALID_ADDRESS;
+    }
+
+    bool IsValid() const {
+      return (bit_size != LLDB_INVALID_ADDRESS) &&
+             (bit_offset != LLDB_INVALID_ADDRESS);
+    }
+
+    bool NextBitfieldOffsetIsValid(const uint64_t next_bit_offset) const {
+      if (IsValid()) {
+        // This bitfield info is valid, so any subsequent bitfields must not
+        // overlap and must be at a higher bit offset than any previous
+        // bitfield + size.
+        return (bit_size + bit_offset) <= next_bit_offset;
+      } else {
+        // If this BitfieldInfo is not valid, then any offset is OK.
+        return true;
+      }
+    }
+  };
+
+  void
+  ParseSingleMember(const DWARFDIE &die, const DWARFDIE &parent_die,
+                    lldb_private::CompilerType &class_clang_type,
+                    const lldb::LanguageType class_language,
+                    std::vector<int> &member_accessibilities,
+                    lldb::AccessType &default_accessibility,
+                    DelayedPropertyList &delayed_properties,
+                    lldb_private::ClangASTImporter::LayoutInfo &layout_info,
+                    BitfieldInfo &last_field_info);
+
+  bool CompleteRecordType(const DWARFDIE &die, lldb_private::Type *type,
+                          lldb_private::CompilerType &clang_type);
+  bool CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type,
+                        lldb_private::CompilerType &clang_type);
+
+  lldb::TypeSP ParseTypeModifier(const lldb_private::SymbolContext &sc,
+                                 const DWARFDIE &die,
+                                 ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseEnum(const lldb_private::SymbolContext &sc,
+                         const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParseSubroutine(const DWARFDIE &die,
+                               ParsedDWARFTypeAttributes &attrs);
+  // FIXME: attrs should be passed as a const reference.
+  lldb::TypeSP ParseArrayType(const DWARFDIE &die,
+                              ParsedDWARFTypeAttributes &attrs);
+  lldb::TypeSP ParsePointerToMemberType(const DWARFDIE &die,
+                                        const ParsedDWARFTypeAttributes &attrs);
 };

 /// Parsed form of all attributes that are relevant for type reconstruction.
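The new BitfieldInfo bookkeeping is easiest to see in isolation. A simplified standalone model of the invariant (Bitfield and NextOffsetIsValid are renamed stand-ins, and UINT64_MAX stands in for LLDB_INVALID_ADDRESS; this is a sketch, not the class above):

#include <cassert>
#include <cstdint>

// A later bitfield must start at or after the end (offset + size) of the
// previously parsed bitfield; anything else means the debug info is bogus.
struct Bitfield {
  uint64_t bit_size = UINT64_MAX;   // UINT64_MAX marks "no previous field"
  uint64_t bit_offset = UINT64_MAX;
  bool IsValid() const {
    return bit_size != UINT64_MAX && bit_offset != UINT64_MAX;
  }
  bool NextOffsetIsValid(uint64_t next) const {
    return !IsValid() || bit_size + bit_offset <= next;
  }
};

int main() {
  Bitfield last;                       // no previous field yet
  assert(last.NextOffsetIsValid(0));   // any offset is accepted
  last = {4, 0};                       // 4-bit field occupying bits [0, 4)
  assert(last.NextOffsetIsValid(4));   // abuts the previous field: fine
  assert(!last.NextOffsetIsValid(2));  // overlaps: must be rejected
}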
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.cpp index eb307ce1cce1b..db8d7b3747ecd 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.cpp @@ -81,7 +81,8 @@ const DWARFDataExtractor &DWARFContext::getOrLoadRangesData() { } const DWARFDataExtractor &DWARFContext::getOrLoadRngListsData() { - return LoadOrGetSection(eSectionTypeDWARFDebugRngLists, llvm::None, + return LoadOrGetSection(eSectionTypeDWARFDebugRngLists, + eSectionTypeDWARFDebugRngListsDwo, m_data_debug_rnglists); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h index d1b066ffe80cb..056cf33a202f1 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h @@ -16,7 +16,6 @@ #include "DWARFTypeUnit.h" #include "DWARFUnit.h" #include "SymbolFileDWARF.h" -#include "lldb/Core/STLUtils.h" #include "lldb/lldb-private.h" #include "llvm/Support/Error.h" @@ -24,11 +23,6 @@ namespace lldb_private { class DWARFContext; } -typedef std::multimap - CStringToDIEMap; -typedef CStringToDIEMap::iterator CStringToDIEMapIter; -typedef CStringToDIEMap::const_iterator CStringToDIEMapConstIter; - class DWARFDebugInfo { public: typedef dw_offset_t (*Callback)(SymbolFileDWARF *dwarf2Data, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 8c0fbeb4b717b..1bab4e9db6343 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -200,7 +200,7 @@ bool DWARFDebugInfoEntry::Extract(const DWARFDataExtractor &data, return false; } -static DWARFRangeList GetRangesOrReportError(const DWARFUnit &unit, +static DWARFRangeList GetRangesOrReportError(DWARFUnit &unit, const DWARFDebugInfoEntry &die, const DWARFFormValue &value) { llvm::Expected expected_ranges = @@ -223,7 +223,7 @@ static DWARFRangeList GetRangesOrReportError(const DWARFUnit &unit, // Gets the valid address ranges for a given DIE by looking for a // DW_AT_low_pc/DW_AT_high_pc pair, DW_AT_entry_pc, or DW_AT_ranges attributes. 
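For context on the DW_AT_low_pc/DW_AT_high_pc pair mentioned just above: since DWARF v4, DW_AT_high_pc may be encoded either as an address or as a constant, and in the constant case its value is an offset from DW_AT_low_pc rather than an absolute end address. A minimal sketch of that general DWARF rule (GetHighPC is a hypothetical helper, not code from this patch):

#include <cstdint>

// Decode the end address of a DIE's [low_pc, high_pc) range. When high_pc
// uses a constant-class form (e.g. DW_FORM_data4) it is relative to low_pc;
// when it uses an address form it is already absolute.
static uint64_t GetHighPC(uint64_t low_pc, uint64_t high_pc_value,
                          bool high_pc_is_constant_form) {
  return high_pc_is_constant_form ? low_pc + high_pc_value : high_pc_value;
}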
bool DWARFDebugInfoEntry::GetDIENamesAndRanges( - const DWARFUnit *cu, const char *&name, const char *&mangled, + DWARFUnit *cu, const char *&name, const char *&mangled, DWARFRangeList &ranges, int &decl_file, int &decl_line, int &decl_column, int &call_file, int &call_line, int &call_column, DWARFExpression *frame_base) const { @@ -766,7 +766,7 @@ bool DWARFDebugInfoEntry::GetAttributeAddressRange( } size_t DWARFDebugInfoEntry::GetAttributeAddressRanges( - const DWARFUnit *cu, DWARFRangeList &ranges, bool check_hi_lo_pc, + DWARFUnit *cu, DWARFRangeList &ranges, bool check_hi_lo_pc, bool check_specification_or_abstract_origin) const { ranges.Clear(); @@ -1012,8 +1012,7 @@ DWARFDebugInfoEntry::GetQualifiedName(DWARFUnit *cu, return storage.c_str(); } -bool DWARFDebugInfoEntry::LookupAddress(const dw_addr_t address, - const DWARFUnit *cu, +bool DWARFDebugInfoEntry::LookupAddress(const dw_addr_t address, DWARFUnit *cu, DWARFDebugInfoEntry **function_die, DWARFDebugInfoEntry **block_die) { bool found_address = false; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h index f3952ae9598b2..f35af6e7d498a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.h @@ -50,7 +50,7 @@ class DWARFDebugInfoEntry { bool Extract(const lldb_private::DWARFDataExtractor &data, const DWARFUnit *cu, lldb::offset_t *offset_ptr); - bool LookupAddress(const dw_addr_t address, const DWARFUnit *cu, + bool LookupAddress(const dw_addr_t address, DWARFUnit *cu, DWARFDebugInfoEntry **function_die, DWARFDebugInfoEntry **block_die); @@ -91,7 +91,7 @@ class DWARFDebugInfoEntry { bool check_specification_or_abstract_origin = false) const; size_t GetAttributeAddressRanges( - const DWARFUnit *cu, DWARFRangeList &ranges, bool check_hi_lo_pc, + DWARFUnit *cu, DWARFRangeList &ranges, bool check_hi_lo_pc, bool check_specification_or_abstract_origin = false) const; const char *GetName(const DWARFUnit *cu) const; @@ -116,7 +116,7 @@ class DWARFDebugInfoEntry { dw_attr_t attr, DWARFFormValue &form_value); bool GetDIENamesAndRanges( - const DWARFUnit *cu, const char *&name, const char *&mangled, + DWARFUnit *cu, const char *&name, const char *&mangled, DWARFRangeList &rangeList, int &decl_file, int &decl_line, int &decl_column, int &call_file, int &call_line, int &call_column, lldb_private::DWARFExpression *frame_base = nullptr) const; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.cpp index 0b08fa09f9063..3b344f4509159 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.cpp @@ -122,164 +122,3 @@ bool DWARFDebugRanges::FindRanges(const DWARFUnit *cu, } return false; } - -bool DWARFDebugRngLists::ExtractRangeList( - const DWARFDataExtractor &data, uint8_t addrSize, - lldb::offset_t *offset_ptr, std::vector &rangeList) { - rangeList.clear(); - - bool error = false; - while (!error) { - switch (data.GetU8(offset_ptr)) { - case DW_RLE_end_of_list: - return true; - - case DW_RLE_start_length: { - dw_addr_t begin = data.GetMaxU64(offset_ptr, addrSize); - dw_addr_t len = data.GetULEB128(offset_ptr); - rangeList.push_back({DW_RLE_start_length, begin, len}); - break; - } - - case DW_RLE_start_end: { - dw_addr_t begin = data.GetMaxU64(offset_ptr, addrSize); - dw_addr_t end = data.GetMaxU64(offset_ptr, addrSize); - 
rangeList.push_back({DW_RLE_start_end, begin, end}); - break; - } - - case DW_RLE_base_address: { - dw_addr_t base = data.GetMaxU64(offset_ptr, addrSize); - rangeList.push_back({DW_RLE_base_address, base, 0}); - break; - } - - case DW_RLE_offset_pair: { - dw_addr_t begin = data.GetULEB128(offset_ptr); - dw_addr_t end = data.GetULEB128(offset_ptr); - rangeList.push_back({DW_RLE_offset_pair, begin, end}); - break; - } - - case DW_RLE_base_addressx: { - dw_addr_t base = data.GetULEB128(offset_ptr); - rangeList.push_back({DW_RLE_base_addressx, base, 0}); - break; - } - - case DW_RLE_startx_endx: { - dw_addr_t start = data.GetULEB128(offset_ptr); - dw_addr_t end = data.GetULEB128(offset_ptr); - rangeList.push_back({DW_RLE_startx_endx, start, end}); - break; - } - - case DW_RLE_startx_length: { - dw_addr_t start = data.GetULEB128(offset_ptr); - dw_addr_t length = data.GetULEB128(offset_ptr); - rangeList.push_back({DW_RLE_startx_length, start, length}); - break; - } - - default: - lldbassert(0 && "unknown range list entry encoding"); - error = true; - } - } - - return false; -} - -static uint64_t ReadAddressFromDebugAddrSection(const DWARFUnit *cu, - uint32_t index) { - uint32_t index_size = cu->GetAddressByteSize(); - dw_offset_t addr_base = cu->GetAddrBase(); - lldb::offset_t offset = addr_base + index * index_size; - return cu->GetSymbolFileDWARF() - .GetDWARFContext() - .getOrLoadAddrData() - .GetMaxU64(&offset, index_size); -} - -bool DWARFDebugRngLists::FindRanges(const DWARFUnit *cu, - dw_offset_t debug_ranges_offset, - DWARFRangeList &range_list) const { - range_list.Clear(); - dw_addr_t debug_ranges_address = cu->GetRangesBase() + debug_ranges_offset; - auto pos = m_range_map.find(debug_ranges_address); - if (pos != m_range_map.end()) { - dw_addr_t BaseAddr = cu->GetBaseAddress(); - for (const RngListEntry &E : pos->second) { - switch (E.encoding) { - case DW_RLE_start_length: - range_list.Append(DWARFRangeList::Entry(E.value0, E.value1)); - break; - case DW_RLE_base_address: - BaseAddr = E.value0; - break; - case DW_RLE_start_end: - range_list.Append(DWARFRangeList::Entry(E.value0, E.value1 - E.value0)); - break; - case DW_RLE_offset_pair: - range_list.Append( - DWARFRangeList::Entry(BaseAddr + E.value0, E.value1 - E.value0)); - break; - case DW_RLE_base_addressx: { - BaseAddr = ReadAddressFromDebugAddrSection(cu, E.value0); - break; - } - case DW_RLE_startx_endx: { - dw_addr_t start = ReadAddressFromDebugAddrSection(cu, E.value0); - dw_addr_t end = ReadAddressFromDebugAddrSection(cu, E.value1); - range_list.Append(DWARFRangeList::Entry(start, end - start)); - break; - } - case DW_RLE_startx_length: { - dw_addr_t start = ReadAddressFromDebugAddrSection(cu, E.value0); - range_list.Append(DWARFRangeList::Entry(start, E.value1)); - break; - } - default: - llvm_unreachable("unexpected encoding"); - } - } - return true; - } - return false; -} - -void DWARFDebugRngLists::Extract(DWARFContext &context) { - const DWARFDataExtractor &data = context.getOrLoadRngListsData(); - lldb::offset_t offset = 0; - - uint64_t length = data.GetU32(&offset); - // FIXME: Handle DWARF64. - lldb::offset_t end = offset + length; - - // Check version. - if (data.GetU16(&offset) < 5) - return; - - uint8_t addrSize = data.GetU8(&offset); - - // We do not support non-zero segment selector size. 
- if (data.GetU8(&offset) != 0) { - lldbassert(0 && "not implemented"); - return; - } - - uint32_t offsetsAmount = data.GetU32(&offset); - for (uint32_t i = 0; i < offsetsAmount; ++i) - Offsets.push_back(data.GetMaxU64(&offset, 4)); - - lldb::offset_t listOffset = offset; - std::vector rangeList; - while (offset < end && ExtractRangeList(data, addrSize, &offset, rangeList)) { - m_range_map[listOffset] = rangeList; - listOffset = offset; - } -} - -uint64_t DWARFDebugRngLists::GetOffset(size_t Index) const { - return Offsets[Index]; -} diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.h index c398259056b3e..99ef04d7ee214 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugRanges.h @@ -48,27 +48,4 @@ class DWARFDebugRanges final : public DWARFDebugRangesBase { range_map m_range_map; }; -// DWARF v5 .debug_rnglists section. -class DWARFDebugRngLists final : public DWARFDebugRangesBase { - struct RngListEntry { - uint8_t encoding; - uint64_t value0; - uint64_t value1; - }; - -public: - void Extract(lldb_private::DWARFContext &context) override; - bool FindRanges(const DWARFUnit *cu, dw_offset_t debug_ranges_offset, - DWARFRangeList &range_list) const override; - uint64_t GetOffset(size_t Index) const; - -protected: - bool ExtractRangeList(const lldb_private::DWARFDataExtractor &data, - uint8_t addrSize, lldb::offset_t *offset_ptr, - std::vector &list); - - std::vector Offsets; - std::map> m_range_map; -}; - #endif // SymbolFileDWARF_DWARFDebugRanges_h_ diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index 9964cf4b893c4..71375da844da7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -417,8 +417,44 @@ dw_offset_t DWARFUnit::GetLineTableOffset() { void DWARFUnit::SetAddrBase(dw_addr_t addr_base) { m_addr_base = addr_base; } +// Parse the rangelist table header, including the optional array of offsets +// following it (DWARF v5 and later). +template +static llvm::Expected +ParseListTableHeader(const llvm::DWARFDataExtractor &data, uint64_t offset, + DwarfFormat format) { + // We are expected to be called with Offset 0 or pointing just past the table + // header. Correct Offset in the latter case so that it points to the start + // of the header. 
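The rewind performed just below is sized by the list-table header. For the DWARF32 case the arithmetic works out as follows (a worked example; the 12-byte figure follows from the v5 .debug_rnglists header layout):

// DWARF32 .debug_rnglists header, as laid out by DWARF v5:
//
//   unit_length             4 bytes
//   version                 2 bytes
//   address_size            1 byte
//   segment_selector_size   1 byte
//   offset_entry_count      4 bytes
//                          --------
//   header size            12 bytes
//
// DW_AT_rnglists_base conventionally points just past this header, at the
// first offset-array entry. So a base of 0x0c means the header itself starts
// at 0x0c - 12 = 0x00, which is the position the code rewinds to before
// calling extractHeaderAndOffsets().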
+ if (offset > 0) { + uint64_t HeaderSize = llvm::DWARFListTableHeader::getHeaderSize(format); + if (offset < HeaderSize) + return llvm::createStringError(errc::invalid_argument, + "did not detect a valid" + " list table with base = 0x%" PRIx64 "\n", + offset); + offset -= HeaderSize; + } + ListTableType Table; + if (llvm::Error E = Table.extractHeaderAndOffsets(data, &offset)) + return std::move(E); + return Table; +} + void DWARFUnit::SetRangesBase(dw_addr_t ranges_base) { m_ranges_base = ranges_base; + + if (GetVersion() < 5) + return; + + if (auto table_or_error = ParseListTableHeader( + m_dwarf.GetDWARFContext().getOrLoadRngListsData().GetAsLLVM(), + ranges_base, DWARF32)) + m_rnglist_table = std::move(table_or_error.get()); + else + GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError( + "Failed to extract range list table at offset 0x%" PRIx64 ": %s", + ranges_base, toString(table_or_error.takeError()).c_str()); } void DWARFUnit::SetStrOffsetsBase(dw_offset_t str_offsets_base) { @@ -845,30 +881,56 @@ uint32_t DWARFUnit::GetHeaderByteSize() const { } llvm::Expected -DWARFUnit::FindRnglistFromOffset(dw_offset_t offset) const { - const DWARFDebugRangesBase *debug_ranges; - llvm::StringRef section; +DWARFUnit::FindRnglistFromOffset(dw_offset_t offset) { if (GetVersion() <= 4) { - debug_ranges = m_dwarf.GetDebugRanges(); - section = "debug_ranges"; - } else { - debug_ranges = m_dwarf.GetDebugRngLists(); - section = "debug_rnglists"; + const DWARFDebugRangesBase *debug_ranges = m_dwarf.GetDebugRanges(); + if (!debug_ranges) + return llvm::make_error( + "No debug_ranges section"); + DWARFRangeList ranges; + debug_ranges->FindRanges(this, offset, ranges); + return ranges; } - if (!debug_ranges) - return llvm::make_error("No " + section + - " section"); + + if (!m_rnglist_table) + return llvm::createStringError(errc::invalid_argument, + "missing or invalid range list table"); + + auto range_list_or_error = m_rnglist_table->findList( + m_dwarf.GetDWARFContext().getOrLoadRngListsData().GetAsLLVM(), offset); + if (!range_list_or_error) + return range_list_or_error.takeError(); + + llvm::Expected llvm_ranges = + range_list_or_error->getAbsoluteRanges( + llvm::object::SectionedAddress{GetBaseAddress()}, + [&](uint32_t index) { + uint32_t index_size = GetAddressByteSize(); + dw_offset_t addr_base = GetAddrBase(); + lldb::offset_t offset = addr_base + index * index_size; + return llvm::object::SectionedAddress{ + m_dwarf.GetDWARFContext().getOrLoadAddrData().GetMaxU64( + &offset, index_size)}; + }); + if (!llvm_ranges) + return llvm_ranges.takeError(); DWARFRangeList ranges; - debug_ranges->FindRanges(this, offset, ranges); + for (const llvm::DWARFAddressRange &llvm_range : *llvm_ranges) { + ranges.Append(DWARFRangeList::Entry(llvm_range.LowPC, + llvm_range.HighPC - llvm_range.LowPC)); + } return ranges; } llvm::Expected -DWARFUnit::FindRnglistFromIndex(uint32_t index) const { - const DWARFDebugRngLists *debug_rnglists = m_dwarf.GetDebugRngLists(); - if (!debug_rnglists) - return llvm::make_error( - "No debug_rnglists section"); - return FindRnglistFromOffset(debug_rnglists->GetOffset(index)); +DWARFUnit::FindRnglistFromIndex(uint32_t index) { + if (llvm::Optional offset = GetRnglistOffset(index)) + return FindRnglistFromOffset(*offset); + if (m_rnglist_table) + return llvm::createStringError(errc::invalid_argument, + "invalid range list table index %d", index); + + return llvm::createStringError(errc::invalid_argument, + "missing or invalid range list table"); } diff --git 
a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index 87e0de283de4b..fe64222f8f50b 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -216,12 +216,23 @@ class DWARFUnit : public lldb_private::UserID { /// Return a list of address ranges resulting from a (possibly encoded) /// range list starting at a given offset in the appropriate ranges section. - llvm::Expected FindRnglistFromOffset(dw_offset_t offset) const; + llvm::Expected FindRnglistFromOffset(dw_offset_t offset); /// Return a list of address ranges retrieved from an encoded range /// list whose offset is found via a table lookup given an index (DWARF v5 /// and later). - llvm::Expected FindRnglistFromIndex(uint32_t index) const; + llvm::Expected FindRnglistFromIndex(uint32_t index); + + /// Return a rangelist's offset based on an index. The index designates + /// an entry in the rangelist table's offset array and is supplied by + /// DW_FORM_rnglistx. + llvm::Optional GetRnglistOffset(uint32_t Index) const { + if (!m_rnglist_table) + return llvm::None; + if (llvm::Optional off = m_rnglist_table->getOffsetEntry(Index)) + return *off + m_ranges_base; + return llvm::None; + } protected: DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, @@ -288,6 +299,9 @@ class DWARFUnit : public lldb_private::UserID { dw_offset_t m_line_table_offset = DW_INVALID_OFFSET; dw_offset_t m_str_offsets_base = 0; // Value of DW_AT_str_offsets_base. + + llvm::Optional m_rnglist_table; + const DIERef::Section m_section; private: diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index fcdff01dd20b9..9b9077a450b3a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -198,15 +198,23 @@ GetFileByIndex(const llvm::DWARFDebugLine::Prologue &prologue, size_t idx, return std::move(rel_path); } -static FileSpecList ParseSupportFilesFromPrologue( - const lldb::ModuleSP &module, - const llvm::DWARFDebugLine::Prologue &prologue, FileSpec::Style style, - llvm::StringRef compile_dir = {}, FileSpec first_file = {}) { +static FileSpecList +ParseSupportFilesFromPrologue(const lldb::ModuleSP &module, + const llvm::DWARFDebugLine::Prologue &prologue, + FileSpec::Style style, + llvm::StringRef compile_dir = {}) { FileSpecList support_files; - support_files.Append(first_file); + size_t first_file = 0; + if (prologue.getVersion() <= 4) { + // File index 0 is not valid before DWARF v5. Add a dummy entry to ensure + // support file list indices match those we get from the debug info and line + // tables. 
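The placeholder appended just below compensates for a numbering change between DWARF versions. Roughly (illustrative summary, matching the comment in the hunk):

// Line-table file numbering across DWARF versions:
//
//   v2-v4: file_names entries are numbered starting at 1; index 0 is
//          reserved, so LLDB inserts an empty FileSpec at slot 0 to keep its
//          support-file indices aligned with the indices the line table and
//          debug info actually use.
//
//   v5:    file_names entries are numbered starting at 0, and entry 0 names
//          the compilation's primary source file, so the prologue entries
//          can be appended one-for-one with no placeholder.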
+ support_files.Append(FileSpec()); + first_file = 1; + } const size_t number_of_files = prologue.FileNames.size(); - for (size_t idx = 1; idx <= number_of_files; ++idx) { + for (size_t idx = first_file; idx <= number_of_files; ++idx) { std::string remapped_file; if (auto file_path = GetFileByIndex(prologue, idx, compile_dir, style)) if (!module->RemapSourceFile(llvm::StringRef(*file_path), remapped_file)) @@ -676,21 +684,6 @@ DWARFDebugRanges *SymbolFileDWARF::GetDebugRanges() { return m_ranges.get(); } -DWARFDebugRngLists *SymbolFileDWARF::GetDebugRngLists() { - if (!m_rnglists) { - static Timer::Category func_cat(LLVM_PRETTY_FUNCTION); - Timer scoped_timer(func_cat, "%s this = %p", LLVM_PRETTY_FUNCTION, - static_cast(this)); - - if (m_context.getOrLoadRngListsData().GetByteSize() > 0) - m_rnglists.reset(new DWARFDebugRngLists()); - - if (m_rnglists) - m_rnglists->Extract(m_context); - } - return m_rnglists.get(); -} - lldb::CompUnitSP SymbolFileDWARF::ParseCompileUnit(DWARFCompileUnit &dwarf_cu) { CompUnitSP cu_sp; CompileUnit *comp_unit = (CompileUnit *)dwarf_cu.GetUserData(); @@ -1046,7 +1039,7 @@ bool SymbolFileDWARF::ParseLineTable(CompileUnit &comp_unit) { comp_unit.SetSupportFiles(ParseSupportFilesFromPrologue( comp_unit.GetModule(), line_table->Prologue, dwarf_cu->GetPathStyle(), - dwarf_cu->GetCompilationDirectory().GetCString(), FileSpec(comp_unit))); + dwarf_cu->GetCompilationDirectory().GetCString())); return true; } @@ -1949,9 +1942,8 @@ uint32_t SymbolFileDWARF::ResolveSymbolContext(const FileSpec &file_spec, if (!dc_cu) continue; - const bool full_match = (bool)file_spec.GetDirectory(); bool file_spec_matches_cu_file_spec = - FileSpec::Equal(file_spec, *dc_cu, full_match); + FileSpec::Match(file_spec, dc_cu->GetPrimaryFile()); if (check_inlines || file_spec_matches_cu_file_spec) { SymbolContext sc(m_objfile_sp->GetModule()); sc.comp_unit = dc_cu; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 9e4e4279eec9f..35b18f4b02b35 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -47,7 +47,6 @@ class DWARFDebugInfo; class DWARFDebugInfoEntry; class DWARFDebugLine; class DWARFDebugRanges; -class DWARFDebugRngLists; class DWARFDeclContext; class DWARFFormValue; class DWARFTypeUnit; @@ -236,7 +235,6 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, const DWARFDebugInfo *DebugInfo() const; DWARFDebugRanges *GetDebugRanges(); - DWARFDebugRngLists *GetDebugRngLists(); const lldb_private::DWARFDataExtractor &DebugLocData(); @@ -499,7 +497,6 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, typedef llvm::StringMap NameToOffsetMap; NameToOffsetMap m_function_scope_qualified_name_map; std::unique_ptr m_ranges; - std::unique_ptr m_rnglists; UniqueDWARFASTTypeMap m_unique_ast_type_map; DIEToTypePtr m_die_to_type; DIEToVariableSP m_die_to_variable_sp; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp index dbdbf49929412..cce666a222d07 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.cpp @@ -604,7 +604,7 @@ SymbolFileDWARFDebugMap::CompileUnitInfo * SymbolFileDWARFDebugMap::GetCompUnitInfo(const CompileUnit &comp_unit) { const uint32_t cu_count = GetNumCompileUnits(); for (uint32_t i = 0; i < cu_count; ++i) { - if (comp_unit 
== m_compile_unit_infos[i].compile_unit_sp.get()) + if (&comp_unit == m_compile_unit_infos[i].compile_unit_sp.get()) return &m_compile_unit_infos[i]; } return nullptr; @@ -812,12 +812,8 @@ uint32_t SymbolFileDWARFDebugMap::ResolveSymbolContext( if (!resolve) { FileSpec so_file_spec; - if (GetFileSpecForSO(i, so_file_spec)) { - // Match the full path if the incoming file_spec has a directory (not - // just a basename) - const bool full_match = (bool)file_spec.GetDirectory(); - resolve = FileSpec::Equal(file_spec, so_file_spec, full_match); - } + if (GetFileSpecForSO(i, so_file_spec)) + resolve = FileSpec::Match(file_spec, so_file_spec); } if (resolve) { SymbolFileDWARF *oso_dwarf = GetSymbolFileByOSOIndex(i); diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp index f0308e23c9d77..22d1b08ea9e7e 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp @@ -1110,9 +1110,7 @@ bool SymbolFileNativePDB::ParseLineTable(CompileUnit &comp_unit) { // LLDB wants the index of the file in the list of support files. auto fn_iter = llvm::find(cci->m_file_list, *efn); lldbassert(fn_iter != cci->m_file_list.end()); - // LLDB support file indices are 1-based. - uint32_t file_index = - 1 + std::distance(cci->m_file_list.begin(), fn_iter); + uint32_t file_index = std::distance(cci->m_file_list.begin(), fn_iter); std::unique_ptr sequence( line_table->CreateLineSequenceContainer()); @@ -1155,14 +1153,6 @@ bool SymbolFileNativePDB::ParseSupportFiles(CompileUnit &comp_unit, FileSpec spec(f, style); support_files.Append(spec); } - - llvm::SmallString<64> main_source_file = - m_index->compilands().GetMainSourceFile(*cci); - FileSpec::Style style = main_source_file.startswith("/") - ? FileSpec::Style::posix - : FileSpec::Style::windows; - FileSpec spec(main_source_file, style); - support_files.Insert(0, spec); return true; } diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp index e7bc730ca38b8..b3e06fdd1a5db 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp @@ -371,10 +371,6 @@ bool SymbolFilePDB::ParseSupportFiles( support_files.AppendIfUnique(spec); } - // LLDB uses the DWARF-like file numeration (one based), - // the zeroth file is the compile unit itself - support_files.Insert(0, comp_unit); - return true; } @@ -1780,7 +1776,6 @@ bool SymbolFilePDB::ParseCompileUnitLineTable(CompileUnit &comp_unit, auto line_table = std::make_unique(&comp_unit); // Find contributions to `compiland` from all source and header files. 
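Several hunks in this span replace `FileSpec::Equal(a, b, full_match)` — where each call site derived `full_match` from whether the search spec carried a directory — with a single `FileSpec::Match(pattern, file)` call. The new semantics (implemented later in this diff in FileSpec.cpp) can be sketched standalone; `PathPattern` is a stand-in for lldb_private::FileSpec:

```cpp
// Minimal sketch of the FileSpec::Match semantics this patch switches to.
#include <string>

struct PathPattern {
  std::string directory; // may be empty
  std::string filename;  // may be empty
};

// Match rules mirrored from the new FileSpec::Match:
//  - a pattern with a directory must match the full path,
//  - a pattern with only a filename matches on basename,
//  - an empty pattern matches any file.
bool Match(const PathPattern &pattern, const PathPattern &file) {
  if (!pattern.directory.empty())
    return pattern.directory == file.directory &&
           pattern.filename == file.filename;
  if (!pattern.filename.empty())
    return pattern.filename == file.filename;
  return true;
}
```

This is what lets the call sites above drop the repeated `const bool full_match = (bool)file_spec.GetDirectory();` boilerplate.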
- std::string path = comp_unit.GetPath(); auto files = m_session_up->getSourceFilesForCompiland(*compiland_up); if (!files) return false; @@ -1882,9 +1877,7 @@ void SymbolFilePDB::BuildSupportFileIdToSupportFileIndexMap( if (!source_files) return; - // LLDB uses the DWARF-like file numeration (one based) - int index = 1; - + int index = 0; while (auto file = source_files->getNext()) { uint32_t source_id = file->getUniqueId(); index_map[source_id] = index++; diff --git a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp index e61e5763fabb9..d4d7a8937c127 100644 --- a/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp +++ b/lldb/source/Plugins/SymbolVendor/ELF/SymbolVendorELF.cpp @@ -119,14 +119,17 @@ SymbolVendorELF::CreateInstance(const lldb::ModuleSP &module_sp, SectionList *objfile_section_list = dsym_objfile_sp->GetSectionList(); static const SectionType g_sections[] = { - eSectionTypeDWARFDebugAbbrev, eSectionTypeDWARFDebugAddr, - eSectionTypeDWARFDebugAranges, eSectionTypeDWARFDebugCuIndex, - eSectionTypeDWARFDebugFrame, eSectionTypeDWARFDebugInfo, - eSectionTypeDWARFDebugLine, eSectionTypeDWARFDebugLoc, - eSectionTypeDWARFDebugMacInfo, eSectionTypeDWARFDebugPubNames, - eSectionTypeDWARFDebugPubTypes, eSectionTypeDWARFDebugRanges, - eSectionTypeDWARFDebugStr, eSectionTypeDWARFDebugStrOffsets, - eSectionTypeELFSymbolTable, eSectionTypeDWARFGNUDebugAltLink, + eSectionTypeDWARFDebugAbbrev, eSectionTypeDWARFDebugAddr, + eSectionTypeDWARFDebugAranges, eSectionTypeDWARFDebugCuIndex, + eSectionTypeDWARFDebugFrame, eSectionTypeDWARFDebugInfo, + eSectionTypeDWARFDebugLine, eSectionTypeDWARFDebugLineStr, + eSectionTypeDWARFDebugLoc, eSectionTypeDWARFDebugLocLists, + eSectionTypeDWARFDebugMacInfo, eSectionTypeDWARFDebugMacro, + eSectionTypeDWARFDebugNames, eSectionTypeDWARFDebugPubNames, + eSectionTypeDWARFDebugPubTypes, eSectionTypeDWARFDebugRanges, + eSectionTypeDWARFDebugRngLists, eSectionTypeDWARFDebugStr, + eSectionTypeDWARFDebugStrOffsets, eSectionTypeDWARFDebugTypes, + eSectionTypeELFSymbolTable, eSectionTypeDWARFGNUDebugAltLink, }; for (SectionType section_type : g_sections) { if (SectionSP section_sp = diff --git a/lldb/source/Symbol/ClangASTContext.cpp b/lldb/source/Symbol/ClangASTContext.cpp index 244ac8ce5ff87..e6435a2611741 100644 --- a/lldb/source/Symbol/ClangASTContext.cpp +++ b/lldb/source/Symbol/ClangASTContext.cpp @@ -15,24 +15,6 @@ #include #include - -// Clang headers like to use NDEBUG inside of them to enable/disable debug -// related features using "#ifndef NDEBUG" preprocessor blocks to do one thing -// or another. This is bad because it means that if clang was built in release -// mode, it assumes that you are building in release mode which is not always -// the case. You can end up with functions that are defined as empty in header -// files when NDEBUG is not defined, and this can cause link errors with the -// clang .a files that you have since you might be missing functions in the .a -// file. So we have to define NDEBUG when including clang headers to avoid any -// mismatches. 
This is covered by rdar://problem/8691220 - -#if !defined(NDEBUG) && !defined(LLVM_NDEBUG_OFF) -#define LLDB_DEFINED_NDEBUG_FOR_CLANG -#define NDEBUG -// Need to include assert.h so it is as clang would expect it to be (disabled) -#include -#endif - #include "clang/AST/ASTContext.h" #include "clang/AST/ASTImporter.h" #include "clang/AST/Attr.h" @@ -54,13 +36,6 @@ #include "clang/Frontend/FrontendOptions.h" #include "clang/Sema/Sema.h" -#ifdef LLDB_DEFINED_NDEBUG_FOR_CLANG -#undef NDEBUG -#undef LLDB_DEFINED_NDEBUG_FOR_CLANG -// Need to re-include assert.h so it is as _we_ would expect it to be (enabled) -#include -#endif - #include "llvm/Support/Signals.h" #include "llvm/Support/Threading.h" @@ -337,6 +312,8 @@ static ClangASTMap &GetASTMap() { return *g_map_ptr; } +char ClangASTContext::ID; + bool ClangASTContext::IsOperator(llvm::StringRef name, clang::OverloadedOperatorKind &op_kind) { // All operators have to start with "operator". @@ -522,8 +499,7 @@ static void ParseLangArgs(LangOptions &Opts, InputKind IK, const char *triple) { Opts.NoInlineDefine = !Opt; } -ClangASTContext::ClangASTContext(llvm::StringRef target_triple) - : TypeSystem(TypeSystem::eKindClang) { +ClangASTContext::ClangASTContext(llvm::StringRef target_triple) { if (!target_triple.empty()) SetTargetTriple(target_triple); // The caller didn't pass an ASTContext so create a new one for this @@ -531,16 +507,14 @@ ClangASTContext::ClangASTContext(llvm::StringRef target_triple) CreateASTContext(); } -ClangASTContext::ClangASTContext(ArchSpec arch) - : TypeSystem(TypeSystem::eKindClang) { +ClangASTContext::ClangASTContext(ArchSpec arch) { SetTargetTriple(arch.GetTriple().str()); // The caller didn't pass an ASTContext so create a new one for this // ClangASTContext. CreateASTContext(); } -ClangASTContext::ClangASTContext(ASTContext &existing_ctxt) - : TypeSystem(TypeSystem::eKindClang) { +ClangASTContext::ClangASTContext(ASTContext &existing_ctxt) { SetTargetTriple(existing_ctxt.getTargetInfo().getTriple().str()); m_ast_up.reset(&existing_ctxt); @@ -563,47 +537,47 @@ uint32_t ClangASTContext::GetPluginVersion() { return 1; } lldb::TypeSystemSP ClangASTContext::CreateInstance(lldb::LanguageType language, lldb_private::Module *module, Target *target) { - if (ClangASTContextSupportsLanguage(language)) { - ArchSpec arch; - if (module) - arch = module->GetArchitecture(); - else if (target) - arch = target->GetArchitecture(); - - if (arch.IsValid()) { - ArchSpec fixed_arch = arch; - // LLVM wants this to be set to iOS or MacOSX; if we're working on - // a bare-boards type image, change the triple for llvm's benefit. 
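Besides deleting the NDEBUG workaround, this span defines `char ClangASTContext::ID;` and drops the `TypeSystem(TypeSystem::eKindClang)` base initializers; later hunks replace `dc.IsClang()` with `isa<ClangASTContext>(...)`. That is the LLVM-style RTTI pattern, sketched below with illustrative class names (only the `ID`/`classof` mechanics are taken from the patch):

```cpp
// Sketch of LLVM-style RTTI replacing the old eKindClang enum check;
// class names are illustrative, not the real LLDB hierarchy.
#include "llvm/Support/Casting.h"

class TypeSystem {
public:
  virtual ~TypeSystem() = default;
  // LLVM-style RTTI: subclasses compare against a class-unique address
  // instead of carrying an enum kind.
  virtual bool isA(const void *ClassID) const = 0;
};

class ClangTypeSystem : public TypeSystem {
public:
  static char ID; // the address of this is the unique class identifier
  bool isA(const void *ClassID) const override { return ClassID == &ID; }
  static bool classof(const TypeSystem *ts) { return ts->isA(&ID); }
};

char ClangTypeSystem::ID; // one definition, as with ClangASTContext::ID

bool IsClangContext(const TypeSystem *ts) {
  return ts && llvm::isa<ClangTypeSystem>(ts); // enabled by classof()
}
```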
- if (fixed_arch.GetTriple().getVendor() == llvm::Triple::Apple && - fixed_arch.GetTriple().getOS() == llvm::Triple::UnknownOS) { - if (fixed_arch.GetTriple().getArch() == llvm::Triple::arm || - fixed_arch.GetTriple().getArch() == llvm::Triple::aarch64 || - fixed_arch.GetTriple().getArch() == llvm::Triple::aarch64_32 || - fixed_arch.GetTriple().getArch() == llvm::Triple::thumb) { - fixed_arch.GetTriple().setOS(llvm::Triple::IOS); - } else { - fixed_arch.GetTriple().setOS(llvm::Triple::MacOSX); - } - } - - if (module) { - std::shared_ptr ast_sp( - new ClangASTContext(fixed_arch)); - return ast_sp; - } else if (target && target->IsValid()) { - std::shared_ptr ast_sp( - new ClangASTContextForExpressions(*target, fixed_arch)); - ast_sp->m_scratch_ast_source_up.reset( - new ClangASTSource(target->shared_from_this())); - lldbassert(ast_sp->getFileManager()); - ast_sp->m_scratch_ast_source_up->InstallASTContext( - *ast_sp->getASTContext(), *ast_sp->getFileManager(), true); - llvm::IntrusiveRefCntPtr proxy_ast_source( - ast_sp->m_scratch_ast_source_up->CreateProxy()); - ast_sp->SetExternalSource(proxy_ast_source); - return ast_sp; - } - } + if (!ClangASTContextSupportsLanguage(language)) + return lldb::TypeSystemSP(); + ArchSpec arch; + if (module) + arch = module->GetArchitecture(); + else if (target) + arch = target->GetArchitecture(); + + if (!arch.IsValid()) + return lldb::TypeSystemSP(); + + ArchSpec fixed_arch = arch; + // LLVM wants this to be set to iOS or MacOSX; if we're working on + // a bare-boards type image, change the triple for llvm's benefit. + if (fixed_arch.GetTriple().getVendor() == llvm::Triple::Apple && + fixed_arch.GetTriple().getOS() == llvm::Triple::UnknownOS) { + if (fixed_arch.GetTriple().getArch() == llvm::Triple::arm || + fixed_arch.GetTriple().getArch() == llvm::Triple::aarch64 || + fixed_arch.GetTriple().getArch() == llvm::Triple::aarch64_32 || + fixed_arch.GetTriple().getArch() == llvm::Triple::thumb) { + fixed_arch.GetTriple().setOS(llvm::Triple::IOS); + } else { + fixed_arch.GetTriple().setOS(llvm::Triple::MacOSX); + } + } + + if (module) { + std::shared_ptr ast_sp(new ClangASTContext(fixed_arch)); + return ast_sp; + } else if (target && target->IsValid()) { + std::shared_ptr ast_sp( + new ClangASTContextForExpressions(*target, fixed_arch)); + ast_sp->m_scratch_ast_source_up.reset( + new ClangASTSource(target->shared_from_this())); + lldbassert(ast_sp->getFileManager()); + ast_sp->m_scratch_ast_source_up->InstallASTContext( + *ast_sp, *ast_sp->getFileManager(), true); + llvm::IntrusiveRefCntPtr proxy_ast_source( + ast_sp->m_scratch_ast_source_up->CreateProxy()); + ast_sp->SetExternalSource(proxy_ast_source); + return ast_sp; } return lldb::TypeSystemSP(); } @@ -844,77 +818,62 @@ static inline bool QualTypeMatchesBitSize(const uint64_t bit_size, CompilerType ClangASTContext::GetBuiltinTypeForEncodingAndBitSize(Encoding encoding, size_t bit_size) { - return ClangASTContext::GetBuiltinTypeForEncodingAndBitSize( - getASTContext(), encoding, bit_size); -} - -CompilerType ClangASTContext::GetBuiltinTypeForEncodingAndBitSize( - ASTContext *ast, Encoding encoding, uint32_t bit_size) { - auto *clang_ast_context = ClangASTContext::GetASTContext(ast); + ASTContext *ast = this->getASTContext(); if (!ast) return CompilerType(); switch (encoding) { case eEncodingInvalid: if (QualTypeMatchesBitSize(bit_size, ast, ast->VoidPtrTy)) - return CompilerType(clang_ast_context, ast->VoidPtrTy.getAsOpaquePtr()); + return CompilerType(this, ast->VoidPtrTy.getAsOpaquePtr()); break; case 
eEncodingUint: if (QualTypeMatchesBitSize(bit_size, ast, ast->UnsignedCharTy)) - return CompilerType(clang_ast_context, - ast->UnsignedCharTy.getAsOpaquePtr()); + return CompilerType(this, ast->UnsignedCharTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->UnsignedShortTy)) - return CompilerType(clang_ast_context, - ast->UnsignedShortTy.getAsOpaquePtr()); + return CompilerType(this, ast->UnsignedShortTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->UnsignedIntTy)) - return CompilerType(clang_ast_context, - ast->UnsignedIntTy.getAsOpaquePtr()); + return CompilerType(this, ast->UnsignedIntTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->UnsignedLongTy)) - return CompilerType(clang_ast_context, - ast->UnsignedLongTy.getAsOpaquePtr()); + return CompilerType(this, ast->UnsignedLongTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->UnsignedLongLongTy)) - return CompilerType(clang_ast_context, - ast->UnsignedLongLongTy.getAsOpaquePtr()); + return CompilerType(this, ast->UnsignedLongLongTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->UnsignedInt128Ty)) - return CompilerType(clang_ast_context, - ast->UnsignedInt128Ty.getAsOpaquePtr()); + return CompilerType(this, ast->UnsignedInt128Ty.getAsOpaquePtr()); break; case eEncodingSint: if (QualTypeMatchesBitSize(bit_size, ast, ast->SignedCharTy)) - return CompilerType(clang_ast_context, - ast->SignedCharTy.getAsOpaquePtr()); + return CompilerType(this, ast->SignedCharTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->ShortTy)) - return CompilerType(clang_ast_context, ast->ShortTy.getAsOpaquePtr()); + return CompilerType(this, ast->ShortTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->IntTy)) - return CompilerType(clang_ast_context, ast->IntTy.getAsOpaquePtr()); + return CompilerType(this, ast->IntTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->LongTy)) - return CompilerType(clang_ast_context, ast->LongTy.getAsOpaquePtr()); + return CompilerType(this, ast->LongTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->LongLongTy)) - return CompilerType(clang_ast_context, ast->LongLongTy.getAsOpaquePtr()); + return CompilerType(this, ast->LongLongTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->Int128Ty)) - return CompilerType(clang_ast_context, ast->Int128Ty.getAsOpaquePtr()); + return CompilerType(this, ast->Int128Ty.getAsOpaquePtr()); break; case eEncodingIEEE754: if (QualTypeMatchesBitSize(bit_size, ast, ast->FloatTy)) - return CompilerType(clang_ast_context, ast->FloatTy.getAsOpaquePtr()); + return CompilerType(this, ast->FloatTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->DoubleTy)) - return CompilerType(clang_ast_context, ast->DoubleTy.getAsOpaquePtr()); + return CompilerType(this, ast->DoubleTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->LongDoubleTy)) - return CompilerType(clang_ast_context, - ast->LongDoubleTy.getAsOpaquePtr()); + return CompilerType(this, ast->LongDoubleTy.getAsOpaquePtr()); if (QualTypeMatchesBitSize(bit_size, ast, ast->HalfTy)) - return CompilerType(clang_ast_context, ast->HalfTy.getAsOpaquePtr()); + return CompilerType(this, ast->HalfTy.getAsOpaquePtr()); break; case eEncodingVector: // Sanity check that bit_size is a multiple of 8's. 
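The refactor above folds the static GetBuiltinTypeForEncodingAndBitSize helper into the member function so `this` supplies the context, but the lookup logic is unchanged: for each encoding, probe the candidate builtin types in order and return the first whose bit width matches. A simplified table-driven sketch of that idea — the table and names are hypothetical, not the clang ASTContext types:

```cpp
// Simplified stand-in for the encoding/bit-size lookup above: probe
// candidates in order, first width match wins.
#include <cstdint>
#include <string>
#include <vector>

enum class Encoding { Uint, Sint, IEEE754 };

struct Builtin {
  Encoding encoding;
  unsigned bit_size;
  std::string name;
};

// Plays the role of ast->UnsignedCharTy, ast->ShortTy, ast->FloatTy, ...
const std::vector<Builtin> kBuiltins = {
    {Encoding::Uint, 8, "unsigned char"}, {Encoding::Uint, 16, "unsigned short"},
    {Encoding::Uint, 32, "unsigned int"}, {Encoding::Uint, 64, "unsigned long long"},
    {Encoding::Sint, 8, "signed char"},   {Encoding::Sint, 16, "short"},
    {Encoding::Sint, 32, "int"},          {Encoding::Sint, 64, "long long"},
    {Encoding::IEEE754, 32, "float"},     {Encoding::IEEE754, 64, "double"},
};

const Builtin *FindBuiltin(Encoding encoding, unsigned bit_size) {
  for (const Builtin &b : kBuiltins)
    if (b.encoding == encoding && b.bit_size == bit_size)
      return &b; // first match wins, as in the switch above
  return nullptr; // no builtin of that width: caller gets an empty type
}
```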
if (bit_size && !(bit_size & 0x7u)) return CompilerType( - clang_ast_context, - ast->getExtVectorType(ast->UnsignedCharTy, bit_size / 8) - .getAsOpaquePtr()); + this, ast->getExtVectorType(ast->UnsignedCharTy, bit_size / 8) + .getAsOpaquePtr()); break; } @@ -987,11 +946,6 @@ ClangASTContext::GetBasicTypeEnumeration(ConstString name) { return eBasicTypeInvalid; } -CompilerType ClangASTContext::GetBasicType(ConstString name) { - lldb::BasicType basic_type = ClangASTContext::GetBasicTypeEnumeration(name); - return GetBasicType(basic_type); -} - uint32_t ClangASTContext::GetPointerByteSize() { if (m_pointer_byte_size == 0) if (auto size = GetBasicType(lldb::eBasicTypeVoid) @@ -10200,16 +10154,20 @@ bool ClangASTContext::DeclContextIsContainedInLookup( return false; } +static bool IsClangDeclContext(const CompilerDeclContext &dc) { + return dc.IsValid() && isa(dc.GetTypeSystem()); +} + clang::DeclContext * ClangASTContext::DeclContextGetAsDeclContext(const CompilerDeclContext &dc) { - if (dc.IsClang()) + if (IsClangDeclContext(dc)) return (clang::DeclContext *)dc.GetOpaqueDeclContext(); return nullptr; } ObjCMethodDecl * ClangASTContext::DeclContextGetAsObjCMethodDecl(const CompilerDeclContext &dc) { - if (dc.IsClang()) + if (IsClangDeclContext(dc)) return llvm::dyn_cast( (clang::DeclContext *)dc.GetOpaqueDeclContext()); return nullptr; @@ -10217,7 +10175,7 @@ ClangASTContext::DeclContextGetAsObjCMethodDecl(const CompilerDeclContext &dc) { CXXMethodDecl * ClangASTContext::DeclContextGetAsCXXMethodDecl(const CompilerDeclContext &dc) { - if (dc.IsClang()) + if (IsClangDeclContext(dc)) return llvm::dyn_cast( (clang::DeclContext *)dc.GetOpaqueDeclContext()); return nullptr; @@ -10225,7 +10183,7 @@ ClangASTContext::DeclContextGetAsCXXMethodDecl(const CompilerDeclContext &dc) { clang::FunctionDecl * ClangASTContext::DeclContextGetAsFunctionDecl(const CompilerDeclContext &dc) { - if (dc.IsClang()) + if (IsClangDeclContext(dc)) return llvm::dyn_cast( (clang::DeclContext *)dc.GetOpaqueDeclContext()); return nullptr; @@ -10233,7 +10191,7 @@ ClangASTContext::DeclContextGetAsFunctionDecl(const CompilerDeclContext &dc) { clang::NamespaceDecl * ClangASTContext::DeclContextGetAsNamespaceDecl(const CompilerDeclContext &dc) { - if (dc.IsClang()) + if (IsClangDeclContext(dc)) return llvm::dyn_cast( (clang::DeclContext *)dc.GetOpaqueDeclContext()); return nullptr; diff --git a/lldb/source/Symbol/CompileUnit.cpp b/lldb/source/Symbol/CompileUnit.cpp index b37636c3bafc1..b05036e27fcf2 100644 --- a/lldb/source/Symbol/CompileUnit.cpp +++ b/lldb/source/Symbol/CompileUnit.cpp @@ -21,30 +21,21 @@ CompileUnit::CompileUnit(const lldb::ModuleSP &module_sp, void *user_data, const char *pathname, const lldb::user_id_t cu_sym_id, lldb::LanguageType language, lldb_private::LazyBool is_optimized) - : ModuleChild(module_sp), FileSpec(pathname), UserID(cu_sym_id), - m_user_data(user_data), m_language(language), m_flags(0), - m_support_files(), m_line_table_up(), m_variables(), - m_is_optimized(is_optimized) { - if (language != eLanguageTypeUnknown) - m_flags.Set(flagsParsedLanguage); - assert(module_sp); -} + : CompileUnit(module_sp, user_data, FileSpec(pathname), cu_sym_id, language, + is_optimized) {} CompileUnit::CompileUnit(const lldb::ModuleSP &module_sp, void *user_data, const FileSpec &fspec, const lldb::user_id_t cu_sym_id, lldb::LanguageType language, lldb_private::LazyBool is_optimized) - : ModuleChild(module_sp), FileSpec(fspec), UserID(cu_sym_id), - m_user_data(user_data), m_language(language), m_flags(0), - 
m_support_files(), m_line_table_up(), m_variables(), + : ModuleChild(module_sp), UserID(cu_sym_id), m_user_data(user_data), + m_language(language), m_flags(0), m_file_spec(fspec), m_is_optimized(is_optimized) { if (language != eLanguageTypeUnknown) m_flags.Set(flagsParsedLanguage); assert(module_sp); } -CompileUnit::~CompileUnit() {} - void CompileUnit::CalculateSymbolContext(SymbolContext *sc) { sc->comp_unit = this; GetModule()->CalculateSymbolContext(sc); @@ -63,7 +54,7 @@ void CompileUnit::GetDescription(Stream *s, lldb::DescriptionLevel level) const { const char *language = Language::GetNameForLanguageType(m_language); *s << "id = " << (const UserID &)*this << ", file = \"" - << (const FileSpec &)*this << "\", language = \"" << language << '"'; + << this->GetPrimaryFile() << "\", language = \"" << language << '"'; } void CompileUnit::ForeachFunction( @@ -117,8 +108,7 @@ void CompileUnit::Dump(Stream *s, bool show_context) const { s->Printf("%p: ", static_cast(this)); s->Indent(); *s << "CompileUnit" << static_cast(*this) << ", language = \"" - << language << "\", file = '" << static_cast(*this) - << "'\n"; + << language << "\", file = '" << GetPrimaryFile() << "'\n"; // m_types.Dump(s); @@ -217,53 +207,50 @@ VariableListSP CompileUnit::GetVariableList(bool can_create) { return m_variables; } +std::vector FindFileIndexes(const FileSpecList &files, const FileSpec &file) { + std::vector result; + uint32_t idx = -1; + while ((idx = files.FindFileIndex(idx + 1, file, /*full=*/true)) != + UINT32_MAX) + result.push_back(idx); + return result; +} + uint32_t CompileUnit::FindLineEntry(uint32_t start_idx, uint32_t line, const FileSpec *file_spec_ptr, bool exact, LineEntry *line_entry_ptr) { - uint32_t file_idx = 0; + if (!file_spec_ptr) + file_spec_ptr = &GetPrimaryFile(); + std::vector file_indexes = FindFileIndexes(GetSupportFiles(), *file_spec_ptr); + if (file_indexes.empty()) + return UINT32_MAX; - if (file_spec_ptr) { - file_idx = GetSupportFiles().FindFileIndex(1, *file_spec_ptr, true); - if (file_idx == UINT32_MAX) - return UINT32_MAX; - } else { - // All the line table entries actually point to the version of the Compile - // Unit that is in the support files (the one at 0 was artificially added.) - // So prefer the one further on in the support files if it exists... - const FileSpecList &support_files = GetSupportFiles(); - const bool full = true; - file_idx = support_files.FindFileIndex( - 1, support_files.GetFileSpecAtIndex(0), full); - if (file_idx == UINT32_MAX) - file_idx = 0; - } LineTable *line_table = GetLineTable(); if (line_table) - return line_table->FindLineEntryIndexByFileIndex(start_idx, file_idx, line, - exact, line_entry_ptr); + return line_table->FindLineEntryIndexByFileIndex( + start_idx, file_indexes, line, exact, line_entry_ptr); return UINT32_MAX; } -uint32_t CompileUnit::ResolveSymbolContext(const FileSpec &file_spec, - uint32_t line, bool check_inlines, - bool exact, - SymbolContextItem resolve_scope, - SymbolContextList &sc_list) { +void CompileUnit::ResolveSymbolContext(const FileSpec &file_spec, + uint32_t line, bool check_inlines, + bool exact, + SymbolContextItem resolve_scope, + SymbolContextList &sc_list) { // First find all of the file indexes that match our "file_spec". 
If // "file_spec" has an empty directory, then only compare the basenames when // finding file indexes std::vector file_indexes; - const bool full_match = (bool)file_spec.GetDirectory(); bool file_spec_matches_cu_file_spec = - FileSpec::Equal(file_spec, *this, full_match); + FileSpec::Match(file_spec, this->GetPrimaryFile()); // If we are not looking for inlined functions and our file spec doesn't // match then we are done... if (!file_spec_matches_cu_file_spec && !check_inlines) - return 0; + return; uint32_t file_idx = - GetSupportFiles().FindFileIndex(1, file_spec, true); + GetSupportFiles().FindFileIndex(0, file_spec, true); while (file_idx != UINT32_MAX) { file_indexes.push_back(file_idx); file_idx = GetSupportFiles().FindFileIndex(file_idx + 1, file_spec, true); @@ -271,84 +258,67 @@ uint32_t CompileUnit::ResolveSymbolContext(const FileSpec &file_spec, const size_t num_file_indexes = file_indexes.size(); if (num_file_indexes == 0) - return 0; - - const uint32_t prev_size = sc_list.GetSize(); + return; SymbolContext sc(GetModule()); sc.comp_unit = this; - if (line != 0) { - LineTable *line_table = sc.comp_unit->GetLineTable(); - - if (line_table != nullptr) { - uint32_t found_line; - uint32_t line_idx; - - if (num_file_indexes == 1) { - // We only have a single support file that matches, so use the line - // table function that searches for a line entries that match a single - // support file index - LineEntry line_entry; - line_idx = line_table->FindLineEntryIndexByFileIndex( - 0, file_indexes.front(), line, exact, &line_entry); - - // If "exact == true", then "found_line" will be the same as "line". If - // "exact == false", the "found_line" will be the closest line entry - // with a line number greater than "line" and we will use this for our - // subsequent line exact matches below. - found_line = line_entry.line; - - while (line_idx != UINT32_MAX) { - // If they only asked for the line entry, then we're done, we can - // just copy that over. But if they wanted more than just the line - // number, fill it in. - if (resolve_scope == eSymbolContextLineEntry) { - sc.line_entry = line_entry; - } else { - line_entry.range.GetBaseAddress().CalculateSymbolContext( - &sc, resolve_scope); - } - - sc_list.Append(sc); - line_idx = line_table->FindLineEntryIndexByFileIndex( - line_idx + 1, file_indexes.front(), found_line, true, - &line_entry); - } - } else { - // We found multiple support files that match "file_spec" so use the - // line table function that searches for a line entries that match a - // multiple support file indexes. - LineEntry line_entry; - line_idx = line_table->FindLineEntryIndexByFileIndex( - 0, file_indexes, line, exact, &line_entry); - - // If "exact == true", then "found_line" will be the same as "line". If - // "exact == false", the "found_line" will be the closest line entry - // with a line number greater than "line" and we will use this for our - // subsequent line exact matches below. 
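The new FindFileIndexes helper above collects every support-file index matching a FileSpec, so FindLineEntry can search the line table across all of them instead of the old special-casing around index 0. A standalone sketch of the helper, with std::string standing in for FileSpec and UINT32_MAX kept as the "not found" sentinel the real FileSpecList::FindFileIndex uses:

```cpp
// Sketch of the FindFileIndexes() pattern introduced above.
#include <cstdint>
#include <string>
#include <vector>

uint32_t FindFileIndex(const std::vector<std::string> &files, uint32_t start,
                       const std::string &file) {
  for (uint32_t i = start; i < files.size(); ++i)
    if (files[i] == file)
      return i;
  return UINT32_MAX; // sentinel: no further match
}

std::vector<uint32_t> FindFileIndexes(const std::vector<std::string> &files,
                                      const std::string &file) {
  std::vector<uint32_t> result;
  uint32_t idx = -1; // wraps so the first probe starts at index 0
  while ((idx = FindFileIndex(files, idx + 1, file)) != UINT32_MAX)
    result.push_back(idx);
  return result;
}
```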
- found_line = line_entry.line; - - while (line_idx != UINT32_MAX) { - if (resolve_scope == eSymbolContextLineEntry) { - sc.line_entry = line_entry; - } else { - line_entry.range.GetBaseAddress().CalculateSymbolContext( - &sc, resolve_scope); - } - - sc_list.Append(sc); - line_idx = line_table->FindLineEntryIndexByFileIndex( - line_idx + 1, file_indexes, found_line, true, &line_entry); - } - } + if (line == 0) { + if (file_spec_matches_cu_file_spec && !check_inlines) { + // only append the context if we aren't looking for inline call sites by + // file and line and if the file spec matches that of the compile unit + sc_list.Append(sc); } - } else if (file_spec_matches_cu_file_spec && !check_inlines) { - // only append the context if we aren't looking for inline call sites by - // file and line and if the file spec matches that of the compile unit + return; + } + + LineTable *line_table = sc.comp_unit->GetLineTable(); + + if (line_table == nullptr) + return; + + uint32_t line_idx; + LineEntry line_entry; + + if (num_file_indexes == 1) { + // We only have a single support file that matches, so use the line + // table function that searches for a line entries that match a single + // support file index + line_idx = line_table->FindLineEntryIndexByFileIndex( + 0, file_indexes.front(), line, exact, &line_entry); + } else { + // We found multiple support files that match "file_spec" so use the + // line table function that searches for a line entries that match a + // multiple support file indexes. + line_idx = line_table->FindLineEntryIndexByFileIndex(0, file_indexes, line, + exact, &line_entry); + } + + // If "exact == true", then "found_line" will be the same as "line". If + // "exact == false", the "found_line" will be the closest line entry + // with a line number greater than "line" and we will use this for our + // subsequent line exact matches below. + uint32_t found_line = line_entry.line; + + while (line_idx != UINT32_MAX) { + // If they only asked for the line entry, then we're done, we can + // just copy that over. But if they wanted more than just the line + // number, fill it in. 
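The rewritten ResolveSymbolContext collapses the duplicated single-index/multi-index branches into one loop: the first query may be inexact (closest line greater than the requested one), and every follow-up query re-searches for that found line exactly. The iteration pattern can be sketched on its own — `FindFn` below is a stand-in for LineTable::FindLineEntryIndexByFileIndex, not the real signature:

```cpp
// Sketch of the line-entry iteration pattern in the rewrite above.
#include <cstdint>
#include <functional>
#include <vector>

struct LineEntry { uint32_t line = 0; };

using FindFn = std::function<uint32_t(uint32_t start_idx, uint32_t line,
                                      bool exact, LineEntry *entry)>;

void CollectMatches(const FindFn &find, uint32_t line, bool exact,
                    std::vector<LineEntry> &out) {
  LineEntry entry;
  uint32_t idx = find(0, line, exact, &entry);
  // With exact == false the first hit may be the closest line > "line";
  // every subsequent query then matches that found line exactly.
  uint32_t found_line = entry.line;
  while (idx != UINT32_MAX) {
    out.push_back(entry);
    idx = find(idx + 1, found_line, /*exact=*/true, &entry);
  }
}
```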
+ if (resolve_scope == eSymbolContextLineEntry) { + sc.line_entry = line_entry; + } else { + line_entry.range.GetBaseAddress().CalculateSymbolContext(&sc, + resolve_scope); + } + sc_list.Append(sc); + if (num_file_indexes == 1) + line_idx = line_table->FindLineEntryIndexByFileIndex( + line_idx + 1, file_indexes.front(), found_line, true, &line_entry); + else + line_idx = line_table->FindLineEntryIndexByFileIndex( + line_idx + 1, file_indexes, found_line, true, &line_entry); } - return sc_list.GetSize() - prev_size; } bool CompileUnit::GetIsOptimized() { diff --git a/lldb/source/Symbol/CompilerDecl.cpp b/lldb/source/Symbol/CompilerDecl.cpp index 2c64113a2bbeb..48d9169c1a7a2 100644 --- a/lldb/source/Symbol/CompilerDecl.cpp +++ b/lldb/source/Symbol/CompilerDecl.cpp @@ -12,10 +12,6 @@ using namespace lldb_private; -bool CompilerDecl::IsClang() const { - return IsValid() && m_type_system->getKind() == TypeSystem::eKindClang; -} - ConstString CompilerDecl::GetName() const { return m_type_system->DeclGetName(m_opaque_decl); } diff --git a/lldb/source/Symbol/CompilerDeclContext.cpp b/lldb/source/Symbol/CompilerDeclContext.cpp index a6f046c4eb22e..672de6ec34d1e 100644 --- a/lldb/source/Symbol/CompilerDeclContext.cpp +++ b/lldb/source/Symbol/CompilerDeclContext.cpp @@ -23,10 +23,6 @@ CompilerDeclContext::FindDeclByName(ConstString name, return std::vector(); } -bool CompilerDeclContext::IsClang() const { - return IsValid() && m_type_system->getKind() == TypeSystem::eKindClang; -} - ConstString CompilerDeclContext::GetName() const { if (IsValid()) return m_type_system->DeclContextGetName(m_opaque_decl_ctx); diff --git a/lldb/source/Symbol/CompilerType.cpp b/lldb/source/Symbol/CompilerType.cpp index 571a8570a43b3..d35213120b4dc 100644 --- a/lldb/source/Symbol/CompilerType.cpp +++ b/lldb/source/Symbol/CompilerType.cpp @@ -874,173 +874,6 @@ bool CompilerType::GetValueAsScalar(const lldb_private::DataExtractor &data, return false; } -bool CompilerType::SetValueFromScalar(const Scalar &value, Stream &strm) { - if (!IsValid()) - return false; - - // Aggregate types don't have scalar values - if (!IsAggregateType()) { - strm.GetFlags().Set(Stream::eBinary); - uint64_t count = 0; - lldb::Encoding encoding = GetEncoding(count); - - if (encoding == lldb::eEncodingInvalid || count != 1) - return false; - - llvm::Optional bit_width = GetBitSize(nullptr); - if (!bit_width) - return false; - - // This function doesn't currently handle non-byte aligned assignments - if ((*bit_width % 8) != 0) - return false; - - const uint64_t byte_size = (*bit_width + 7) / 8; - switch (encoding) { - case lldb::eEncodingInvalid: - break; - case lldb::eEncodingVector: - break; - case lldb::eEncodingUint: - switch (byte_size) { - case 1: - strm.PutHex8(value.UInt()); - return true; - case 2: - strm.PutHex16(value.UInt()); - return true; - case 4: - strm.PutHex32(value.UInt()); - return true; - case 8: - strm.PutHex64(value.ULongLong()); - return true; - default: - break; - } - break; - - case lldb::eEncodingSint: - switch (byte_size) { - case 1: - strm.PutHex8(value.SInt()); - return true; - case 2: - strm.PutHex16(value.SInt()); - return true; - case 4: - strm.PutHex32(value.SInt()); - return true; - case 8: - strm.PutHex64(value.SLongLong()); - return true; - default: - break; - } - break; - - case lldb::eEncodingIEEE754: - if (byte_size <= sizeof(long double)) { - if (byte_size == sizeof(float)) { - strm.PutFloat(value.Float()); - return true; - } else if (byte_size == sizeof(double)) { - strm.PutDouble(value.Double()); - return 
true; - } else if (byte_size == sizeof(long double)) { - strm.PutDouble(value.LongDouble()); - return true; - } - } - break; - } - } - return false; -} - -bool CompilerType::ReadFromMemory(lldb_private::ExecutionContext *exe_ctx, - lldb::addr_t addr, AddressType address_type, - lldb_private::DataExtractor &data) { - if (!IsValid()) - return false; - - // Can't convert a file address to anything valid without more context (which - // Module it came from) - if (address_type == eAddressTypeFile) - return false; - - if (!GetCompleteType()) - return false; - - auto byte_size = - GetByteSize(exe_ctx ? exe_ctx->GetBestExecutionContextScope() : nullptr); - if (!byte_size) - return false; - - if (data.GetByteSize() < *byte_size) { - lldb::DataBufferSP data_sp(new DataBufferHeap(*byte_size, '\0')); - data.SetData(data_sp); - } - - uint8_t *dst = const_cast(data.PeekData(0, *byte_size)); - if (dst != nullptr) { - if (address_type == eAddressTypeHost) { - if (addr == 0) - return false; - // The address is an address in this process, so just copy it - memcpy(dst, reinterpret_cast(addr), *byte_size); - return true; - } else { - Process *process = nullptr; - if (exe_ctx) - process = exe_ctx->GetProcessPtr(); - if (process) { - Status error; - return process->ReadMemory(addr, dst, *byte_size, error) == *byte_size; - } - } - } - return false; -} - -bool CompilerType::WriteToMemory(lldb_private::ExecutionContext *exe_ctx, - lldb::addr_t addr, AddressType address_type, - StreamString &new_value) { - if (!IsValid()) - return false; - - // Can't convert a file address to anything valid without more context (which - // Module it came from) - if (address_type == eAddressTypeFile) - return false; - - if (!GetCompleteType()) - return false; - - auto byte_size = - GetByteSize(exe_ctx ? 
exe_ctx->GetBestExecutionContextScope() : nullptr); - if (!byte_size) - return false; - - if (*byte_size > 0) { - if (address_type == eAddressTypeHost) { - // The address is an address in this process, so just copy it - memcpy((void *)addr, new_value.GetData(), *byte_size); - return true; - } else { - Process *process = nullptr; - if (exe_ctx) - process = exe_ctx->GetProcessPtr(); - if (process) { - Status error; - return process->WriteMemory(addr, new_value.GetData(), *byte_size, - error) == *byte_size; - } - } - } - return false; -} - bool lldb_private::operator==(const lldb_private::CompilerType &lhs, const lldb_private::CompilerType &rhs) { return lhs.GetTypeSystem() == rhs.GetTypeSystem() && diff --git a/lldb/source/Symbol/Declaration.cpp b/lldb/source/Symbol/Declaration.cpp index d78ba967d280b..4d0975d34256c 100644 --- a/lldb/source/Symbol/Declaration.cpp +++ b/lldb/source/Symbol/Declaration.cpp @@ -90,12 +90,9 @@ bool Declaration::FileAndLineEqual(const Declaration &declaration) const { bool lldb_private::operator==(const Declaration &lhs, const Declaration &rhs) { #ifdef LLDB_ENABLE_DECLARATION_COLUMNS - if (lhs.GetColumn() == rhs.GetColumn()) - if (lhs.GetLine() == rhs.GetLine()) - return lhs.GetFile() == rhs.GetFile(); + if (lhs.GetColumn() != rhs.GetColumn()) + return false; #else - if (lhs.GetLine() == rhs.GetLine()) - return FileSpec::Equal(lhs.GetFile(), rhs.GetFile(), true); + return lhs.GetLine() == rhs.GetLine() && lhs.GetFile() == rhs.GetFile(); #endif - return false; } diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index 9e81b6140eb76..c392317df0066 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -340,7 +340,8 @@ Block &Function::GetBlock(bool can_create) { "error: unable to find module " "shared pointer for function '%s' " "in %s\n", - GetName().GetCString(), m_comp_unit->GetPath().c_str()); + GetName().GetCString(), + m_comp_unit->GetPrimaryFile().GetPath().c_str()); } m_block.SetBlockInfoHasBeenParsed(true, true); } diff --git a/lldb/source/Symbol/LineTable.cpp b/lldb/source/Symbol/LineTable.cpp index 1433dc156d915..fecc90c409f22 100644 --- a/lldb/source/Symbol/LineTable.cpp +++ b/lldb/source/Symbol/LineTable.cpp @@ -34,11 +34,9 @@ void LineTable::InsertLineEntry(lldb::addr_t file_addr, uint32_t line, is_start_of_basic_block, is_prologue_end, is_epilogue_begin, is_terminal_entry); - entry_collection::iterator begin_pos = m_entries.begin(); - entry_collection::iterator end_pos = m_entries.end(); LineTable::Entry::LessThanBinaryPredicate less_than_bp(this); entry_collection::iterator pos = - upper_bound(begin_pos, end_pos, entry, less_than_bp); + llvm::upper_bound(m_entries, entry, less_than_bp); // Stream s(stdout); // s << "\n\nBefore:\n"; @@ -289,8 +287,6 @@ uint32_t LineTable::FindLineEntryIndexByFileIndex( uint32_t line, bool exact, LineEntry *line_entry_ptr) { const size_t count = m_entries.size(); - std::vector::const_iterator begin_pos = file_indexes.begin(); - std::vector::const_iterator end_pos = file_indexes.end(); size_t best_match = UINT32_MAX; for (size_t idx = start_idx; idx < count; ++idx) { @@ -299,7 +295,7 @@ uint32_t LineTable::FindLineEntryIndexByFileIndex( if (m_entries[idx].is_terminal_entry) continue; - if (find(begin_pos, end_pos, m_entries[idx].file_idx) == end_pos) + if (llvm::find(file_indexes, m_entries[idx].file_idx) == file_indexes.end()) continue; // Exact match always wins. 
Otherwise try to find the closest line > the diff --git a/lldb/source/Symbol/LocateSymbolFile.cpp b/lldb/source/Symbol/LocateSymbolFile.cpp index 0d0e5300668fc..d2b39d6acd704 100644 --- a/lldb/source/Symbol/LocateSymbolFile.cpp +++ b/lldb/source/Symbol/LocateSymbolFile.cpp @@ -230,19 +230,19 @@ static FileSpec LocateExecutableSymbolFileDsym(const ModuleSpec &module_spec) { ModuleSpec Symbols::LocateExecutableObjectFile(const ModuleSpec &module_spec) { ModuleSpec result; - const FileSpec *exec_fspec = module_spec.GetFileSpecPtr(); + const FileSpec &exec_fspec = module_spec.GetFileSpec(); const ArchSpec *arch = module_spec.GetArchitecturePtr(); const UUID *uuid = module_spec.GetUUIDPtr(); static Timer::Category func_cat(LLVM_PRETTY_FUNCTION); Timer scoped_timer( func_cat, "LocateExecutableObjectFile (file = %s, arch = %s, uuid = %p)", - exec_fspec ? exec_fspec->GetFilename().AsCString("") : "", + exec_fspec ? exec_fspec.GetFilename().AsCString("") : "", arch ? arch->GetArchitectureName() : "", (const void *)uuid); ModuleSpecList module_specs; ModuleSpec matched_module_spec; if (exec_fspec && - ObjectFile::GetModuleSpecifications(*exec_fspec, 0, 0, module_specs) && + ObjectFile::GetModuleSpecifications(exec_fspec, 0, 0, module_specs) && module_specs.FindMatchingModuleSpec(module_spec, matched_module_spec)) { result.GetFileSpec() = exec_fspec; } else { diff --git a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp index 74718a8c5e307..5ee632ec20773 100644 --- a/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp +++ b/lldb/source/Symbol/LocateSymbolFileMacOSX.cpp @@ -595,7 +595,7 @@ bool Symbols::DownloadObjectAndSymbolFile(ModuleSpec &module_spec, } Status error = Host::RunShellCommand( command.GetData(), - NULL, // current working directory + FileSpec(), // current working directory &exit_status, // Exit status &signo, // Signal int * &command_output, // Command output diff --git a/lldb/source/Symbol/ObjectFile.cpp b/lldb/source/Symbol/ObjectFile.cpp index 38bc7722d0d02..812c6de4da52b 100644 --- a/lldb/source/Symbol/ObjectFile.cpp +++ b/lldb/source/Symbol/ObjectFile.cpp @@ -360,6 +360,7 @@ AddressClass ObjectFile::GetAddressClass(addr_t file_addr) { case eSectionTypeDWARFDebugPubTypes: case eSectionTypeDWARFDebugRanges: case eSectionTypeDWARFDebugRngLists: + case eSectionTypeDWARFDebugRngListsDwo: case eSectionTypeDWARFDebugStr: case eSectionTypeDWARFDebugStrDwo: case eSectionTypeDWARFDebugStrOffsets: @@ -476,7 +477,13 @@ size_t ObjectFile::GetData(lldb::offset_t offset, size_t length, DataExtractor &data) const { // The entire file has already been mmap'ed into m_data, so just copy from // there as the back mmap buffer will be shared with shared pointers. - return data.SetData(m_data, offset, length); + size_t ret = data.SetData(m_data, offset, length); + // DataExtractor::SetData copies the address byte size from m_data, but + // m_data's address byte size is only set from sizeof(void*), and we can't + // access subclasses GetAddressByteSize() when setting up m_data in the + // constructor. 
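The ObjectFile::GetData fix in this span addresses a subtle default: DataExtractor::SetData copies the address byte size from the source extractor, but `m_data` was initialized with the host's `sizeof(void*)` before the subclass could report the target's real width. A minimal sketch of the pitfall and the fix, with `Extractor`/`ObjFile` as stand-ins:

```cpp
// Sketch of the address-byte-size pitfall fixed in ObjectFile::GetData.
#include <cstddef>

struct Extractor {
  unsigned addr_byte_size = sizeof(void *); // host default, may be wrong
  void SetData(const Extractor &src) {
    // Like DataExtractor::SetData, this copies the source's address size...
    addr_byte_size = src.addr_byte_size;
  }
};

struct ObjFile {
  Extractor m_data; // set up in the constructor, before the subclass can
                    // report the target's real address size
  unsigned GetAddressByteSize() const { return 4; } // e.g. a 32-bit target
  void GetData(Extractor &out) const {
    out.SetData(m_data);
    // ...so it must be corrected afterwards, as the patch does:
    out.addr_byte_size = GetAddressByteSize();
  }
};
```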
+ data.SetAddressByteSize(GetAddressByteSize()); + return ret; } size_t ObjectFile::CopyData(lldb::offset_t offset, size_t length, diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index 7828ca613359d..b77c011f8cb8b 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -315,14 +315,14 @@ void SymbolContext::Dump(Stream *s, Target *target) const { s->Indent(); *s << "CompileUnit = " << comp_unit; if (comp_unit != nullptr) - *s << " {0x" << comp_unit->GetID() << "} " - << *(static_cast(comp_unit)); + s->Format(" {{{0:x-16}} {1}", comp_unit->GetID(), + comp_unit->GetPrimaryFile()); s->EOL(); s->Indent(); *s << "Function = " << function; if (function != nullptr) { - *s << " {0x" << function->GetID() << "} " << function->GetType()->GetName() - << ", address-range = "; + s->Format(" {{{0:x-16}} {1}, address-range = ", function->GetID(), + function->GetType()->GetName()); function->GetAddressRange().Dump(s, target, Address::DumpStyleLoadAddress, Address::DumpStyleModuleWithFileAddress); s->EOL(); @@ -337,10 +337,7 @@ void SymbolContext::Dump(Stream *s, Target *target) const { s->Indent(); *s << "Block = " << block; if (block != nullptr) - *s << " {0x" << block->GetID() << '}'; - // Dump the block and pass it a negative depth to we print all the parent - // blocks if (block != NULL) - // block->Dump(s, function->GetFileAddress(), INT_MIN); + s->Format(" {{{0:x-16}}", block->GetID()); s->EOL(); s->Indent(); *s << "LineEntry = "; @@ -354,7 +351,8 @@ void SymbolContext::Dump(Stream *s, Target *target) const { s->EOL(); *s << "Variable = " << variable; if (variable != nullptr) { - *s << " {0x" << variable->GetID() << "} " << variable->GetType()->GetName(); + s->Format(" {{{0:x-16}} {1}", variable->GetID(), + variable->GetType()->GetName()); s->EOL(); } s->IndentLess(); @@ -1028,8 +1026,7 @@ bool SymbolContextSpecifier::SymbolContextMatches(SymbolContext &sc) { return false; } else { FileSpec module_file_spec(m_module_spec); - if (!FileSpec::Equal(module_file_spec, sc.module_sp->GetFileSpec(), - false)) + if (!FileSpec::Match(module_file_spec, sc.module_sp->GetFileSpec())) return false; } } @@ -1048,8 +1045,8 @@ bool SymbolContextSpecifier::SymbolContextMatches(SymbolContext &sc) { sc.block->GetInlinedFunctionInfo(); if (inline_info != nullptr) { was_inlined = true; - if (!FileSpec::Equal(inline_info->GetDeclaration().GetFile(), - *(m_file_spec_up.get()), false)) + if (!FileSpec::Match(*m_file_spec_up, + inline_info->GetDeclaration().GetFile())) return false; } } @@ -1057,7 +1054,7 @@ bool SymbolContextSpecifier::SymbolContextMatches(SymbolContext &sc) { // Next check the comp unit, but only if the SymbolContext was not // inlined. if (!was_inlined && sc.comp_unit != nullptr) { - if (!FileSpec::Equal(*(sc.comp_unit), *(m_file_spec_up.get()), false)) + if (!FileSpec::Match(*m_file_spec_up, sc.comp_unit->GetPrimaryFile())) return false; } } diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp index 9a2b5cddd73b7..c7a6bf2145267 100644 --- a/lldb/source/Symbol/Symtab.cpp +++ b/lldb/source/Symbol/Symtab.cpp @@ -13,7 +13,6 @@ #include "lldb/Core/Module.h" #include "lldb/Core/RichManglingContext.h" -#include "lldb/Core/STLUtils.h" #include "lldb/Core/Section.h" #include "lldb/Symbol/ObjectFile.h" #include "lldb/Symbol/Symbol.h" @@ -107,10 +106,8 @@ void Symtab::Dump(Stream *s, Target *target, SortOrder sort_order, // sorted by name. So we must make the ordered symbol list up ourselves. 
s->PutCString(" (sorted by name):\n"); DumpSymbolHeader(s); - typedef std::multimap - CStringToSymbol; - CStringToSymbol name_map; + + std::multimap name_map; for (const_iterator pos = m_symbols.begin(), end = m_symbols.end(); pos != end; ++pos) { const char *name = pos->GetName().AsCString(); @@ -118,12 +115,10 @@ void Symtab::Dump(Stream *s, Target *target, SortOrder sort_order, name_map.insert(std::make_pair(name, &(*pos))); } - for (CStringToSymbol::const_iterator pos = name_map.begin(), - end = name_map.end(); - pos != end; ++pos) { + for (const auto &name_to_symbol : name_map) { + const Symbol *symbol = name_to_symbol.second; s->Indent(); - pos->second->Dump(s, target, pos->second - &m_symbols[0], - name_preference); + symbol->Dump(s, target, symbol - &m_symbols[0], name_preference); } } break; diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index c3e5c03709517..6465ce3dd156f 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -255,7 +255,7 @@ void Type::Dump(Stream *s, bool show_context) { *s << ", compiler_type = " << m_compiler_type.GetOpaqueQualType() << ' '; GetForwardCompilerType().DumpTypeDescription(s); } else if (m_encoding_uid != LLDB_INVALID_UID) { - *s << ", type_data = " << (uint64_t)m_encoding_uid; + s->Format(", type_data = {0:x-16}", m_encoding_uid); switch (m_encoding_uid_type) { case eEncodingInvalid: break; diff --git a/lldb/source/Symbol/TypeSystem.cpp b/lldb/source/Symbol/TypeSystem.cpp index c63f24aea3354..4e746bd18e1f3 100644 --- a/lldb/source/Symbol/TypeSystem.cpp +++ b/lldb/source/Symbol/TypeSystem.cpp @@ -43,8 +43,6 @@ size_t LanguageSet::Size() const { return bitvector.count(); } bool LanguageSet::Empty() const { return bitvector.none(); } bool LanguageSet::operator[](unsigned i) const { return bitvector[i]; } -TypeSystem::TypeSystem(LLVMCastKind kind) : m_kind(kind), m_sym_file(nullptr) {} - TypeSystem::~TypeSystem() {} static lldb::TypeSystemSP CreateInstanceHelper(lldb::LanguageType language, diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index 427dbf459c4eb..fc7d127a326fa 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -112,7 +112,7 @@ void Variable::Dump(Stream *s, bool show_context) const { if (m_symfile_type_sp) { Type *type = m_symfile_type_sp->GetType(); if (type) { - *s << ", type = {" << type->GetID() << "} " << (void *)type << " ("; + s->Format(", type = {{{0:x-16}} {1} (", type->GetID(), type); type->DumpTypeName(s); s->PutChar(')'); } @@ -134,7 +134,7 @@ void Variable::Dump(Stream *s, bool show_context) const { s->PutCString("thread local"); break; default: - *s << "??? (" << m_scope << ')'; + s->AsRawOstream() << "??? 
(" << m_scope << ')'; } } @@ -487,13 +487,6 @@ static void PrivateAutoComplete( &prefix_path, // Anything that has been resolved already will be in here const CompilerType &compiler_type, CompletionRequest &request); -static void PrivateAutoCompleteMembers( - StackFrame *frame, const std::string &partial_member_name, - llvm::StringRef partial_path, - const llvm::Twine - &prefix_path, // Anything that has been resolved already will be in here - const CompilerType &compiler_type, CompletionRequest &request); - static void PrivateAutoCompleteMembers( StackFrame *frame, const std::string &partial_member_name, llvm::StringRef partial_path, diff --git a/lldb/source/Target/ABI.cpp b/lldb/source/Target/ABI.cpp index 005261e0ddee0..6217ee2ed9ced 100644 --- a/lldb/source/Target/ABI.cpp +++ b/lldb/source/Target/ABI.cpp @@ -63,24 +63,6 @@ bool ABI::GetRegisterInfoByName(ConstString name, RegisterInfo &info) { return false; } -bool ABI::GetRegisterInfoByKind(RegisterKind reg_kind, uint32_t reg_num, - RegisterInfo &info) { - if (reg_kind < eRegisterKindEHFrame || reg_kind >= kNumRegisterKinds) - return false; - - uint32_t count = 0; - const RegisterInfo *register_info_array = GetRegisterInfoArray(count); - if (register_info_array) { - for (uint32_t i = 0; i < count; ++i) { - if (register_info_array[i].kinds[reg_kind] == reg_num) { - info = register_info_array[i]; - return true; - } - } - } - return false; -} - ValueObjectSP ABI::GetReturnValueObject(Thread &thread, CompilerType &ast_type, bool persistent) const { if (!ast_type.IsValid()) @@ -229,3 +211,20 @@ std::unique_ptr ABI::MakeMCRegisterInfo(const ArchSpec &ar assert(info_up); return info_up; } + +void ABI::AugmentRegisterInfo(RegisterInfo &info) { + if (info.kinds[eRegisterKindEHFrame] != LLDB_INVALID_REGNUM && + info.kinds[eRegisterKindDWARF] != LLDB_INVALID_REGNUM) + return; + + RegisterInfo abi_info; + if (!GetRegisterInfoByName(ConstString(info.name), abi_info)) + return; + + if (info.kinds[eRegisterKindEHFrame] == LLDB_INVALID_REGNUM) + info.kinds[eRegisterKindEHFrame] = abi_info.kinds[eRegisterKindEHFrame]; + if (info.kinds[eRegisterKindDWARF] == LLDB_INVALID_REGNUM) + info.kinds[eRegisterKindDWARF] = abi_info.kinds[eRegisterKindDWARF]; + if (info.kinds[eRegisterKindGeneric] == LLDB_INVALID_REGNUM) + info.kinds[eRegisterKindGeneric] = abi_info.kinds[eRegisterKindGeneric]; +} diff --git a/lldb/source/Target/Platform.cpp b/lldb/source/Target/Platform.cpp index c9849a9e5f09f..aaf48f35f921d 100644 --- a/lldb/source/Target/Platform.cpp +++ b/lldb/source/Target/Platform.cpp @@ -406,7 +406,7 @@ void Platform::GetStatus(Stream &strm) { if (arch.IsValid()) { if (!arch.GetTriple().str().empty()) { strm.Printf(" Triple: "); - arch.DumpTriple(strm); + arch.DumpTriple(strm.AsRawOstream()); strm.EOL(); } } diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index ed0b951fbce1a..a731a353c1bc1 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -1486,8 +1486,7 @@ const lldb::ABISP &Process::GetABI() { return m_abi_sp; } -std::vector -Process::GetLanguageRuntimes(bool retry_if_null) { +std::vector Process::GetLanguageRuntimes() { std::vector language_runtimes; if (m_finalizing) @@ -1500,15 +1499,14 @@ Process::GetLanguageRuntimes(bool retry_if_null) { // yet or the proper condition for loading wasn't yet met (e.g. libc++.so // hadn't been loaded). 
for (const lldb::LanguageType lang_type : Language::GetSupportedLanguages()) { - if (LanguageRuntime *runtime = GetLanguageRuntime(lang_type, retry_if_null)) + if (LanguageRuntime *runtime = GetLanguageRuntime(lang_type)) language_runtimes.emplace_back(runtime); } return language_runtimes; } -LanguageRuntime *Process::GetLanguageRuntime(lldb::LanguageType language, - bool retry_if_null) { +LanguageRuntime *Process::GetLanguageRuntime(lldb::LanguageType language) { if (m_finalizing) return nullptr; @@ -1517,7 +1515,7 @@ LanguageRuntime *Process::GetLanguageRuntime(lldb::LanguageType language, std::lock_guard guard(m_language_runtimes_mutex); LanguageRuntimeCollection::iterator pos; pos = m_language_runtimes.find(language); - if (pos == m_language_runtimes.end() || (retry_if_null && !pos->second)) { + if (pos == m_language_runtimes.end() || !pos->second) { lldb::LanguageRuntimeSP runtime_sp( LanguageRuntime::FindPlugin(this, language)); diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 4b9a1b77ad16d..59f72141ee5fc 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -404,8 +404,8 @@ Target::CreateAddressInModuleBreakpoint(lldb::addr_t file_addr, bool internal, bool request_hardware) { SearchFilterSP filter_sp( new SearchFilterForUnconstrainedSearches(shared_from_this())); - BreakpointResolverSP resolver_sp( - new BreakpointResolverAddress(nullptr, file_addr, file_spec)); + BreakpointResolverSP resolver_sp(new BreakpointResolverAddress( + nullptr, file_addr, file_spec ? *file_spec : FileSpec())); return CreateBreakpoint(filter_sp, resolver_sp, internal, request_hardware, false); } @@ -728,11 +728,17 @@ void Target::ConfigureBreakpointName( } void Target::ApplyNameToBreakpoints(BreakpointName &bp_name) { - BreakpointList bkpts_with_name(false); - m_breakpoint_list.FindBreakpointsByName(bp_name.GetName().AsCString(), - bkpts_with_name); + llvm::Expected> expected_vector = + m_breakpoint_list.FindBreakpointsByName(bp_name.GetName().AsCString()); + + if (!expected_vector) { + LLDB_LOG(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_BREAKPOINTS), + "invalid breakpoint name: {}", + llvm::toString(expected_vector.takeError())); + return; + } - for (auto bp_sp : bkpts_with_name.Breakpoints()) + for (auto bp_sp : *expected_vector) bp_name.ConfigureBreakpoint(bp_sp); } @@ -1425,8 +1431,7 @@ void Target::SetExecutableModule(ModuleSP &executable_sp, ModuleList added_modules; executable_objfile->GetDependentModules(dependent_files); for (uint32_t i = 0; i < dependent_files.GetSize(); i++) { - FileSpec dependent_file_spec( - dependent_files.GetFileSpecPointerAtIndex(i)); + FileSpec dependent_file_spec(dependent_files.GetFileSpecAtIndex(i)); FileSpec platform_dependent_file_spec; if (m_platform_sp) m_platform_sp->GetFileWithUUID(dependent_file_spec, nullptr, @@ -3177,7 +3182,7 @@ void Target::StopHook::SetThreadSpecifier(ThreadSpec *specifier) { void Target::StopHook::GetDescription(Stream *s, lldb::DescriptionLevel level) const { - int indent_level = s->GetIndentLevel(); + unsigned indent_level = s->GetIndentLevel(); s->SetIndentLevel(indent_level + 2); @@ -4094,7 +4099,7 @@ void Target::TargetEventData::Dump(Stream *s) const { if (i != 0) *s << ", "; m_module_list.GetModuleAtIndex(i)->GetDescription( - s, lldb::eDescriptionLevelBrief); + s->AsRawOstream(), lldb::eDescriptionLevelBrief); } } diff --git a/lldb/source/Target/TargetList.cpp b/lldb/source/Target/TargetList.cpp index 7c7a36e97bbfe..1b4db0c2aba59 100644 --- 
a/lldb/source/Target/TargetList.cpp +++ b/lldb/source/Target/TargetList.cpp @@ -144,9 +144,9 @@ Status TargetList::CreateTargetInternal( StreamString platform_arch_strm; StreamString module_arch_strm; - platform_arch.DumpTriple(platform_arch_strm); + platform_arch.DumpTriple(platform_arch_strm.AsRawOstream()); matching_module_spec.GetArchitecture().DumpTriple( - module_arch_strm); + module_arch_strm.AsRawOstream()); error.SetErrorStringWithFormat( "the specified architecture '%s' is not compatible with '%s' " "in '%s'", @@ -457,15 +457,12 @@ TargetSP TargetList::FindTargetWithExecutableAndArchitecture( const FileSpec &exe_file_spec, const ArchSpec *exe_arch_ptr) const { std::lock_guard guard(m_target_list_mutex); TargetSP target_sp; - bool full_match = (bool)exe_file_spec.GetDirectory(); - collection::const_iterator pos, end = m_target_list.end(); for (pos = m_target_list.begin(); pos != end; ++pos) { Module *exe_module = (*pos)->GetExecutableModulePointer(); if (exe_module) { - if (FileSpec::Equal(exe_file_spec, exe_module->GetFileSpec(), - full_match)) { + if (FileSpec::Match(exe_file_spec, exe_module->GetFileSpec())) { if (exe_arch_ptr) { if (!exe_arch_ptr->IsCompatibleMatch(exe_module->GetArchitecture())) continue; diff --git a/lldb/source/Target/ThreadPlanStepInRange.cpp b/lldb/source/Target/ThreadPlanStepInRange.cpp index 77772aed516bd..fdb2782bc5182 100644 --- a/lldb/source/Target/ThreadPlanStepInRange.cpp +++ b/lldb/source/Target/ThreadPlanStepInRange.cpp @@ -339,7 +339,7 @@ bool ThreadPlanStepInRange::FrameMatchesAvoidCriteria() { if (frame_library) { for (size_t i = 0; i < num_libraries; i++) { const FileSpec &file_spec(libraries_to_avoid.GetFileSpecAtIndex(i)); - if (FileSpec::Equal(file_spec, frame_library, false)) { + if (FileSpec::Match(file_spec, frame_library)) { libraries_say_avoid = true; break; } diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp index 62d9d246255a1..bbfa5cf61d014 100644 --- a/lldb/source/Utility/ArchSpec.cpp +++ b/lldb/source/Utility/ArchSpec.cpp @@ -868,7 +868,7 @@ void ArchSpec::MergeFrom(const ArchSpec &other) { IsCompatibleMatch(other) && GetCore() == ArchSpec::eCore_arm_generic && other.GetCore() != ArchSpec::eCore_arm_generic) { m_core = other.GetCore(); - CoreUpdated(true); + CoreUpdated(false); } if (GetFlags() == 0) { SetFlags(other.GetFlags()); @@ -1443,21 +1443,24 @@ bool ArchSpec::IsAlwaysThumbInstructions() const { GetCore() == ArchSpec::Core::eCore_thumbv6m) { return true; } + // Windows on ARM is always thumb. + if (GetTriple().isOSWindows()) + return true; } return false; } -void ArchSpec::DumpTriple(Stream &s) const { +void ArchSpec::DumpTriple(llvm::raw_ostream &s) const { const llvm::Triple &triple = GetTriple(); llvm::StringRef arch_str = triple.getArchName(); llvm::StringRef vendor_str = triple.getVendorName(); llvm::StringRef os_str = triple.getOSName(); llvm::StringRef environ_str = triple.getEnvironmentName(); - s.Printf("%s-%s-%s", arch_str.empty() ? "*" : arch_str.str().c_str(), - vendor_str.empty() ? "*" : vendor_str.str().c_str(), - os_str.empty() ? "*" : os_str.str().c_str()); + s << llvm::formatv("{0}-{1}-{2}", arch_str.empty() ? "*" : arch_str, + vendor_str.empty() ? "*" : vendor_str, + os_str.empty() ? 
"*" : os_str); if (!environ_str.empty()) - s.Printf("-%s", environ_str.str().c_str()); + s << "-" << environ_str; } diff --git a/lldb/source/Utility/Baton.cpp b/lldb/source/Utility/Baton.cpp index 84e295e246864..7bba10dcec962 100644 --- a/lldb/source/Utility/Baton.cpp +++ b/lldb/source/Utility/Baton.cpp @@ -8,5 +8,6 @@ #include "lldb/Utility/Baton.h" -void lldb_private::UntypedBaton::GetDescription( - Stream *s, lldb::DescriptionLevel level) const {} +void lldb_private::UntypedBaton::GetDescription(llvm::raw_ostream &s, + lldb::DescriptionLevel level, + unsigned indentation) const {} diff --git a/lldb/source/Utility/FileSpec.cpp b/lldb/source/Utility/FileSpec.cpp index 88966843072b6..a9e542991f179 100644 --- a/lldb/source/Utility/FileSpec.cpp +++ b/lldb/source/Utility/FileSpec.cpp @@ -75,15 +75,6 @@ FileSpec::FileSpec(llvm::StringRef path, Style style) : m_style(style) { FileSpec::FileSpec(llvm::StringRef path, const llvm::Triple &triple) : FileSpec{path, triple.isOSWindows() ? Style::windows : Style::posix} {} -// Copy constructor -FileSpec::FileSpec(const FileSpec *rhs) : m_directory(), m_filename() { - if (rhs) - *this = *rhs; -} - -// Virtual destructor in case anyone inherits from this class. -FileSpec::~FileSpec() {} - namespace { /// Safely get a character at the specified index. /// @@ -302,20 +293,18 @@ int FileSpec::Compare(const FileSpec &a, const FileSpec &b, bool full) { } bool FileSpec::Equal(const FileSpec &a, const FileSpec &b, bool full) { - // case sensitivity of equality test - const bool case_sensitive = a.IsCaseSensitive() || b.IsCaseSensitive(); + if (full || (a.GetDirectory() && b.GetDirectory())) + return a == b; - const bool filenames_equal = ConstString::Equals(a.m_filename, - b.m_filename, - case_sensitive); - - if (!filenames_equal) - return false; - - if (!full && (a.GetDirectory().IsEmpty() || b.GetDirectory().IsEmpty())) - return filenames_equal; + return a.FileEquals(b); +} - return a == b; +bool FileSpec::Match(const FileSpec &pattern, const FileSpec &file) { + if (pattern.GetDirectory()) + return pattern == file; + if (pattern.GetFilename()) + return pattern.FileEquals(file); + return true; } llvm::Optional FileSpec::GuessPathStyle(llvm::StringRef absolute_path) { diff --git a/lldb/source/Utility/ProcessInfo.cpp b/lldb/source/Utility/ProcessInfo.cpp index 5743d223be4fa..a02ee1af867a0 100644 --- a/lldb/source/Utility/ProcessInfo.cpp +++ b/lldb/source/Utility/ProcessInfo.cpp @@ -49,7 +49,7 @@ llvm::StringRef ProcessInfo::GetNameAsStringRef() const { void ProcessInfo::Dump(Stream &s, Platform *platform) const { s << "Executable: " << GetName() << "\n"; s << "Triple: "; - m_arch.DumpTriple(s); + m_arch.DumpTriple(s.AsRawOstream()); s << "\n"; s << "Arguments:\n"; @@ -137,7 +137,7 @@ void ProcessInstanceInfo::Dump(Stream &s, UserIDResolver &resolver) const { if (m_arch.IsValid()) { s.Printf(" arch = "); - m_arch.DumpTriple(s); + m_arch.DumpTriple(s.AsRawOstream()); s.EOL(); } @@ -189,7 +189,7 @@ void ProcessInstanceInfo::DumpAsTableRow(Stream &s, UserIDResolver &resolver, StreamString arch_strm; if (m_arch.IsValid()) - m_arch.DumpTriple(arch_strm); + m_arch.DumpTriple(arch_strm.AsRawOstream()); auto print = [&](bool (ProcessInstanceInfo::*isValid)() const, uint32_t (ProcessInstanceInfo::*getID)() const, diff --git a/lldb/source/Utility/Reproducer.cpp b/lldb/source/Utility/Reproducer.cpp index e0806f5f5981d..8a28e9b13675a 100644 --- a/lldb/source/Utility/Reproducer.cpp +++ b/lldb/source/Utility/Reproducer.cpp @@ -25,6 +25,16 @@ llvm::Error 
Reproducer::Initialize(ReproducerMode mode, lldbassert(!InstanceImpl() && "Already initialized."); InstanceImpl().emplace(); + // The environment can override the capture mode. + if (mode != ReproducerMode::Replay) { + std::string env = + llvm::StringRef(getenv("LLDB_CAPTURE_REPRODUCER")).lower(); + if (env == "0" || env == "off") + mode = ReproducerMode::Off; + else if (env == "1" || env == "on") + mode = ReproducerMode::Capture; + } + switch (mode) { case ReproducerMode::Capture: { if (!root) { diff --git a/lldb/source/Utility/Status.cpp b/lldb/source/Utility/Status.cpp index 3b5094d64b75a..b74db72773dd4 100644 --- a/lldb/source/Utility/Status.cpp +++ b/lldb/source/Utility/Status.cpp @@ -100,14 +100,23 @@ static std::string RetrieveWin32ErrorString(uint32_t error_code) { char *buffer = nullptr; std::string message; // Retrieve win32 system error. + // First, attempt to load an en-US message if (::FormatMessageA( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_MAX_WIDTH_MASK, - NULL, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), + NULL, error_code, MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), (LPSTR)&buffer, 0, NULL)) { message.assign(buffer); ::LocalFree(buffer); } + // If the previous attempt didn't work, use the default OS language + else if (::FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | + FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_MAX_WIDTH_MASK, + NULL, error_code, 0, (LPSTR)&buffer, 0, NULL)) { + message.assign(buffer); + ::LocalFree(buffer); + } return message; } #endif diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index c48a12acd9064..2ef4cd78ab034 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -160,65 +160,19 @@ Stream &Stream::operator<<(const void *p) { return *this; } -// Stream a uint8_t "uval" out to this stream. -Stream &Stream::operator<<(uint8_t uval) { - PutHex8(uval); - return *this; -} - -// Stream a uint16_t "uval" out to this stream. -Stream &Stream::operator<<(uint16_t uval) { - PutHex16(uval, m_byte_order); - return *this; -} - -// Stream a uint32_t "uval" out to this stream. -Stream &Stream::operator<<(uint32_t uval) { - PutHex32(uval, m_byte_order); - return *this; -} - -// Stream a uint64_t "uval" out to this stream. -Stream &Stream::operator<<(uint64_t uval) { - PutHex64(uval, m_byte_order); - return *this; -} - -// Stream a int8_t "sval" out to this stream. -Stream &Stream::operator<<(int8_t sval) { - Printf("%i", static_cast<int>(sval)); - return *this; -} - -// Stream a int16_t "sval" out to this stream. -Stream &Stream::operator<<(int16_t sval) { - Printf("%i", static_cast<int>(sval)); - return *this; -} - -// Stream a int32_t "sval" out to this stream. -Stream &Stream::operator<<(int32_t sval) { - Printf("%i", static_cast<int>(sval)); - return *this; -} - -// Stream a int64_t "sval" out to this stream.
-Stream &Stream::operator<<(int64_t sval) { - Printf("%" PRIi64, sval); - return *this; -} - // Get the current indentation level -int Stream::GetIndentLevel() const { return m_indent_level; } +unsigned Stream::GetIndentLevel() const { return m_indent_level; } // Set the current indentation level -void Stream::SetIndentLevel(int indent_level) { m_indent_level = indent_level; } +void Stream::SetIndentLevel(unsigned indent_level) { + m_indent_level = indent_level; +} // Increment the current indentation level -void Stream::IndentMore(int amount) { m_indent_level += amount; } +void Stream::IndentMore(unsigned amount) { m_indent_level += amount; } // Decrement the current indentation level -void Stream::IndentLess(int amount) { +void Stream::IndentLess(unsigned amount) { if (m_indent_level >= amount) m_indent_level -= amount; else diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 06125a1aaeddb..9b1c3c12f172d 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -54,6 +54,11 @@ def find_shlibpath_var(): lit_config.warning("unable to inject shared library path on '{}'".format( platform.system())) +# Propagate LLDB_CAPTURE_REPRODUCER +if 'LLDB_CAPTURE_REPRODUCER' in os.environ: + config.environment['LLDB_CAPTURE_REPRODUCER'] = os.environ[ + 'LLDB_CAPTURE_REPRODUCER'] + # Clean the module caches in the test build directory. This is necessary in an # incremental build whenever clang changes underneath, so doing it once per # lit.py invocation is close enough. diff --git a/lldb/test/Shell/Minidump/Windows/Inputs/arm-fp-unwind.dmp.yaml b/lldb/test/Shell/Minidump/Windows/Inputs/arm-fp-unwind.dmp.yaml new file mode 100644 index 0000000000000..330a761d88b4c --- /dev/null +++ b/lldb/test/Shell/Minidump/Windows/Inputs/arm-fp-unwind.dmp.yaml @@ -0,0 +1,37 @@ +--- !minidump +Version: 0xA0BAA793 +Flags: 0x0000000000000800 +Streams: + - Type: ThreadList + Threads: + - Thread Id: 0x00004034 + Suspend Count: 0x00000001 + Priority Class: 0x00000020 + Environment Block: 0x00000000007E6000 + Context: 0000000000000000 + Stack: + Start of Memory Range: 0x00000000008FF758 + Content: 00000000000000 + - Type: ModuleList + Modules: + - Base of Image: 0x0000000000C70000 + Size of Image: 0x00002000 + Time Date Stamp: 1574942531 + Module Name: 'arm-fp-unwind.exe' + CodeView Record: '' + Reserved0: 0x0000000000008140 + - Type: SystemInfo + Processor Arch: ARM + Processor Level: 2049 + Processor Revision: 2564 + Number of Processors: 8 + Product type: 1 + Major Version: 10 + Build Number: 18362 + Platform ID: Win32NT + Suite Mask: 0x0100 + CPU: + CPUID: 0xEB8C1004 + - Type: MiscInfo + Content: 
54050000F7010000183800002EB9DF5D00000000000000006C0700002B0100006C0700000400000003000000002000000D000000000000000100000088FFFFFF46004C00450020005300740061006E0064006100720064002000540069006D00650000000000000000000000000000000000000000000000000000000000000000000A000000050004000000000000000000000046004C00450020004400610079006C0069006700680074002000540069006D00650000000000000000000000000000000000000000000000000000000000000000000300000005000300000000000000C4FFFFFF310038003300360032002E003200330039002E00610072006D006600720065002E0031003900680031005F00720065006C0065006100730065005F007300760063005F00700072006F00640031002E003100390030003600320038002D0031003600340031000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000064006200670063006F00720065002E0077006F0061002C00310030002E0030002E00310038003300360032002E003100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 +... 
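The MiscInfo stream above is an opaque hex blob; the binary inputs for this test are never checked in directly but are regenerated from these YAML descriptions by yaml2obj. As a rough, untested sketch of that conversion step outside of lit (the helper name and paths are illustrative; it only assumes yaml2obj from the LLVM build tree is on PATH and mirrors the `yaml2obj ... > ...` RUN lines of the test below):

import subprocess

# Illustrative helper: regenerate the binary minidump and executable from
# their YAML descriptions, as the RUN lines in arm-fp-unwind.test do.
def rebuild_test_inputs(out_dir="."):
    for name in ("arm-fp-unwind.exe", "arm-fp-unwind.dmp"):
        with open("{}/{}".format(out_dir, name), "wb") as out:
            # yaml2obj writes the converted binary to stdout by default.
            subprocess.run(["yaml2obj", "Inputs/{}.yaml".format(name)],
                           stdout=out, check=True)

Keeping the inputs as YAML keeps them diffable and reviewable; the binary artifacts themselves never enter the tree.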
diff --git a/lldb/test/Shell/Minidump/Windows/Inputs/arm-fp-unwind.exe.yaml b/lldb/test/Shell/Minidump/Windows/Inputs/arm-fp-unwind.exe.yaml new file mode 100644 index 0000000000000..f3229060635f2 --- /dev/null +++ b/lldb/test/Shell/Minidump/Windows/Inputs/arm-fp-unwind.exe.yaml @@ -0,0 +1,92 @@ +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4097 + ImageBase: 4194304 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + ExportTable: + RelativeVirtualAddress: 0 + Size: 0 + ImportTable: + RelativeVirtualAddress: 0 + Size: 0 + ResourceTable: + RelativeVirtualAddress: 0 + Size: 0 + ExceptionTable: + RelativeVirtualAddress: 0 + Size: 0 + CertificateTable: + RelativeVirtualAddress: 0 + Size: 0 + BaseRelocationTable: + RelativeVirtualAddress: 0 + Size: 0 + Debug: + RelativeVirtualAddress: 0 + Size: 0 + Architecture: + RelativeVirtualAddress: 0 + Size: 0 + GlobalPtr: + RelativeVirtualAddress: 0 + Size: 0 + TlsTable: + RelativeVirtualAddress: 0 + Size: 0 + LoadConfigTable: + RelativeVirtualAddress: 0 + Size: 0 + BoundImport: + RelativeVirtualAddress: 0 + Size: 0 + IAT: + RelativeVirtualAddress: 0 + Size: 0 + DelayImportDescriptor: + RelativeVirtualAddress: 0 + Size: 0 + ClrRuntimeHeader: + RelativeVirtualAddress: 0 + Size: 0 +header: + Machine: IMAGE_FILE_MACHINE_ARMNT + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 38 + SectionData: 2DE90048EB46ADF5007D684600F004F80DF5007DBDE8008800BE01784278415C805C08447047 +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: entry + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: other + Value: 24 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL +... diff --git a/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test b/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test new file mode 100644 index 0000000000000..35ea7c8a9de0e --- /dev/null +++ b/lldb/test/Shell/Minidump/Windows/arm-fp-unwind.test @@ -0,0 +1,17 @@ +Test that unwind plans use the frame pointer register correctly. + +REQUIRES: arm + +RUN: yaml2obj %S/Inputs/arm-fp-unwind.exe.yaml > %T/arm-fp-unwind.exe +RUN: yaml2obj %S/Inputs/arm-fp-unwind.dmp.yaml > %T/arm-fp-unwind.dmp +RUN: %lldb -O "settings set target.exec-search-paths %T" \ +RUN: -c %T/arm-fp-unwind.dmp -o "image show-unwind -a 0x00c71010" -b \ +RUN: | FileCheck %s + +CHECK: Assembly language inspection UnwindPlan: +CHECK-NEXT: This UnwindPlan originally sourced from EmulateInstructionARM +CHECK-NEXT: This UnwindPlan is sourced from the compiler: no. +CHECK-NEXT: This UnwindPlan is valid at all instruction locations: yes. 
+CHECK-NEXT: row[0]: 0: CFA=sp +0 => +CHECK-NEXT: row[1]: 4: CFA=sp +8 => fp=[CFA-8] lr=[CFA-4] +CHECK-NEXT: row[2]: 6: CFA=fp +8 => fp=[CFA-8] lr=[CFA-4] diff --git a/lldb/test/Shell/ObjectFile/ELF/build-id-case.yaml b/lldb/test/Shell/ObjectFile/ELF/build-id-case.yaml index f9786b3754f84..08366056947bf 100644 --- a/lldb/test/Shell/ObjectFile/ELF/build-id-case.yaml +++ b/lldb/test/Shell/ObjectFile/ELF/build-id-case.yaml @@ -4,8 +4,25 @@ # RUN: llvm-objcopy --strip-all %t/.build-id/1b/8a73ac238390e32a7ff4ac8ebe4d6a41ecf5c9.debug %t/stripped.out # RUN: lldb-test object-file %t/stripped.out | FileCheck %s +# CHECK: Name: .debug_abbrev +# CHECK: Name: .debug_addr +# CHECK: Name: .debug_aranges # CHECK: Name: .debug_frame -# CHECK-NEXT: Type: dwarf-frame +# CHECK: Name: .debug_info +# CHECK: Name: .debug_line +# CHECK: Name: .debug_line_str +# CHECK: Name: .debug_loc +# CHECK: Name: .debug_loclists +# CHECK: Name: .debug_macinfo +# CHECK: Name: .debug_macro +# CHECK: Name: .debug_names +# CHECK: Name: .debug_pubnames +# CHECK: Name: .debug_pubtypes +# CHECK: Name: .debug_ranges +# CHECK: Name: .debug_rnglists +# CHECK: Name: .debug_str +# CHECK: Name: .debug_str_offsets +# CHECK: Name: .debug_types --- !ELF FileHeader: @@ -27,9 +44,62 @@ Sections: Address: 0x00000000004003D0 AddressAlign: 0x0000000000000010 Content: DEADBEEFBAADF00D + - Name: .debug_abbrev + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_addr + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_aranges + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D - Name: .debug_frame Type: SHT_PROGBITS - AddressAlign: 0x0000000000000008 + Content: DEADBEEFBAADF00D + - Name: .debug_info + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_line + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_line_str + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_loc + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_loclists + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_macinfo + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_macro + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_names + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_pubnames + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_pubtypes + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_ranges + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_rnglists + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_str + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_str_offsets + Type: SHT_PROGBITS + Content: DEADBEEFBAADF00D + - Name: .debug_types + Type: SHT_PROGBITS Content: DEADBEEFBAADF00D Symbols: - Name: main diff --git a/lldb/test/Shell/ObjectFile/ELF/section-types.yaml b/lldb/test/Shell/ObjectFile/ELF/section-types.yaml index 9f6b4c0533b91..caac76a789ce0 100644 --- a/lldb/test/Shell/ObjectFile/ELF/section-types.yaml +++ b/lldb/test/Shell/ObjectFile/ELF/section-types.yaml @@ -13,6 +13,12 @@ # CHECK-LABEL: Name: .debug_types.dwo # CHECK-NEXT: Type: dwarf-types-dwo +# CHECK-LABEL: Name: .debug_rnglists +# CHECK-NEXT: Type: dwarf-rnglists + +# CHECK-LABEL: Name: .debug_rnglists.dwo +# CHECK-NEXT: Type: dwarf-rnglists-dwo + # CHECK-LABEL: Name: .debug_names # CHECK-NEXT: Type: dwarf-names @@ -58,6 +64,14 @@ Sections: Type: SHT_PROGBITS AddressAlign: 0x0000000000000001 Content: DEADBEEFBAADF00D + - Name: .debug_rnglists + Type: SHT_PROGBITS + AddressAlign: 
0x0000000000000001 + Content: DEADBEEFBAADF00D + - Name: .debug_rnglists.dwo + Type: SHT_PROGBITS + AddressAlign: 0x0000000000000001 + Content: DEADBEEFBAADF00D - Name: .debug_names Type: SHT_PROGBITS AddressAlign: 0x0000000000000001 diff --git a/lldb/test/Shell/ObjectFile/PECOFF/disassemble-thumb.yaml b/lldb/test/Shell/ObjectFile/PECOFF/disassemble-thumb.yaml new file mode 100644 index 0000000000000..5515824e776bc --- /dev/null +++ b/lldb/test/Shell/ObjectFile/PECOFF/disassemble-thumb.yaml @@ -0,0 +1,94 @@ +# REQUIRES: arm + +# RUN: yaml2obj %s > %t.exe +# RUN: %lldb %t.exe -o "disassemble -b -n entry" -b | FileCheck %s + +# CHECK: {{.*}}.exe[0x401000] <+0>: 0x0040 lsls r0, r0, #0x1 +# CHECK: {{.*}}.exe[0x401002] <+2>: 0x4770 bx lr + +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4097 + ImageBase: 4194304 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + ExportTable: + RelativeVirtualAddress: 0 + Size: 0 + ImportTable: + RelativeVirtualAddress: 0 + Size: 0 + ResourceTable: + RelativeVirtualAddress: 0 + Size: 0 + ExceptionTable: + RelativeVirtualAddress: 0 + Size: 0 + CertificateTable: + RelativeVirtualAddress: 0 + Size: 0 + BaseRelocationTable: + RelativeVirtualAddress: 0 + Size: 0 + Debug: + RelativeVirtualAddress: 0 + Size: 0 + Architecture: + RelativeVirtualAddress: 0 + Size: 0 + GlobalPtr: + RelativeVirtualAddress: 0 + Size: 0 + TlsTable: + RelativeVirtualAddress: 0 + Size: 0 + LoadConfigTable: + RelativeVirtualAddress: 0 + Size: 0 + BoundImport: + RelativeVirtualAddress: 0 + Size: 0 + IAT: + RelativeVirtualAddress: 0 + Size: 0 + DelayImportDescriptor: + RelativeVirtualAddress: 0 + Size: 0 + ClrRuntimeHeader: + RelativeVirtualAddress: 0 + Size: 0 +header: + Machine: IMAGE_FILE_MACHINE_ARMNT + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 4 + SectionData: '40007047' +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: entry + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL +... 
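The test above hands a minimal PECOFF/ARMNT image to lldb and expects the two halfwords at `entry` to decode as Thumb (`lsls r0, r0, #0x1`, then `bx lr`), exercising the "Windows on ARM is always thumb" rule added to ArchSpec::IsAlwaysThumbInstructions earlier in this patch. Below is a standalone, untested sketch of the same RUN/CHECK pair, with FileCheck replaced by plain substring assertions; it assumes yaml2obj and lldb are on PATH, and the function name is hypothetical:

import subprocess

def check_thumb_disassembly(yaml_path, exe_path):
    # Build the PECOFF ARMNT image from its YAML description.
    with open(exe_path, "wb") as out:
        subprocess.run(["yaml2obj", yaml_path], stdout=out, check=True)
    # Batch-mode disassembly of `entry`, as in the RUN line above.
    result = subprocess.run(
        ["lldb", exe_path, "-o", "disassemble -b -n entry", "-b"],
        capture_output=True, text=True, check=True)
    # If lldb fell back to ARM mode, the two halfwords would be read as a
    # single 32-bit instruction and the expected mnemonics would not appear.
    assert "lsls" in result.stdout and "bx" in result.stdout
    return result.stdout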
diff --git a/lldb/test/Shell/ObjectFile/PECOFF/section-types.yaml b/lldb/test/Shell/ObjectFile/PECOFF/section-types.yaml new file mode 100644 index 0000000000000..caf955500e09f --- /dev/null +++ b/lldb/test/Shell/ObjectFile/PECOFF/section-types.yaml @@ -0,0 +1,92 @@ +# RUN: yaml2obj %s > %t +# RUN: lldb-test object-file %t | FileCheck %s + +# CHECK-LABEL: Name: .text +# CHECK-NEXT: Type: code + +# CHECK-LABEL: Name: .eh_fram +# CHECK-NEXT: Type: eh-frame +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 4096 + ImageBase: 4194304 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 6 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 6 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [ IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE, IMAGE_DLL_CHARACTERISTICS_NX_COMPAT, IMAGE_DLL_CHARACTERISTICS_NO_SEH, IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE ] + SizeOfStackReserve: 1048576 + SizeOfStackCommit: 4096 + SizeOfHeapReserve: 1048576 + SizeOfHeapCommit: 4096 + ExportTable: + RelativeVirtualAddress: 0 + Size: 0 + ImportTable: + RelativeVirtualAddress: 0 + Size: 0 + ResourceTable: + RelativeVirtualAddress: 0 + Size: 0 + ExceptionTable: + RelativeVirtualAddress: 0 + Size: 0 + CertificateTable: + RelativeVirtualAddress: 0 + Size: 0 + BaseRelocationTable: + RelativeVirtualAddress: 12288 + Size: 12 + Debug: + RelativeVirtualAddress: 0 + Size: 0 + Architecture: + RelativeVirtualAddress: 0 + Size: 0 + GlobalPtr: + RelativeVirtualAddress: 0 + Size: 0 + TlsTable: + RelativeVirtualAddress: 0 + Size: 0 + LoadConfigTable: + RelativeVirtualAddress: 0 + Size: 0 + BoundImport: + RelativeVirtualAddress: 0 + Size: 0 + IAT: + RelativeVirtualAddress: 0 + Size: 0 + DelayImportDescriptor: + RelativeVirtualAddress: 0 + Size: 0 + ClrRuntimeHeader: + RelativeVirtualAddress: 0 + Size: 0 +header: + Machine: IMAGE_FILE_MACHINE_I386 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE ] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 4096 + VirtualSize: 5 + SectionData: 5589E55DC3 + - Name: .eh_fram + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ] + VirtualAddress: 8192 + VirtualSize: 52 + SectionData: 1400000000000000017A5200017C0801000C040488010000180000001C000000001040000500000000410E088502420D05000000 + - Name: .reloc + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ] + VirtualAddress: 12288 + VirtualSize: 12 + SectionData: 002000000C00000020300000 +symbols: +... diff --git a/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test b/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test new file mode 100644 index 0000000000000..a8e7bdec250e6 --- /dev/null +++ b/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test @@ -0,0 +1,20 @@ +# UNSUPPORTED: system-windows +# This tests the LLDB_CAPTURE_REPRODUCER override. 
+ +# RUN: %lldb -b -o 'reproducer status' --capture --capture-path %t.repro /bin/ls | FileCheck %s --check-prefix CAPTURE +# RUN: %lldb -b -o 'reproducer status' --capture | FileCheck %s --check-prefix CAPTURE + +# RUN: LLDB_CAPTURE_REPRODUCER=1 %lldb -b -o 'reproducer status' | FileCheck %s --check-prefix CAPTURE +# RUN: LLDB_CAPTURE_REPRODUCER=ON %lldb -b -o 'reproducer status' | FileCheck %s --check-prefix CAPTURE +# RUN: LLDB_CAPTURE_REPRODUCER=on %lldb -b -o 'reproducer status' | FileCheck %s --check-prefix CAPTURE + +# RUN: LLDB_CAPTURE_REPRODUCER=0 %lldb -b -o 'reproducer status' --capture --capture-path %t.repro | FileCheck %s --check-prefix OFF +# RUN: LLDB_CAPTURE_REPRODUCER=0 %lldb -b -o 'reproducer status' --capture | FileCheck %s --check-prefix OFF +# RUN: LLDB_CAPTURE_REPRODUCER=OFF %lldb -b -o 'reproducer status' --capture --capture-path %t.repro | FileCheck %s --check-prefix OFF +# RUN: LLDB_CAPTURE_REPRODUCER=off %lldb -b -o 'reproducer status' --capture | FileCheck %s --check-prefix OFF + +# RUN: LLDB_CAPTURE_REPRODUCER=bogus %lldb -b -o 'reproducer status' --capture | FileCheck %s --check-prefix CAPTURE +# RUN: LLDB_CAPTURE_REPRODUCER=bogus %lldb -b -o 'reproducer status' | FileCheck %s --check-prefix OFF + +# CAPTURE: Reproducer is in capture mode. +# OFF: Reproducer is off. diff --git a/lldb/test/Shell/Reproducer/TestVersionCheck.test b/lldb/test/Shell/Reproducer/TestVersionCheck.test new file mode 100644 index 0000000000000..e3fb60367cec2 --- /dev/null +++ b/lldb/test/Shell/Reproducer/TestVersionCheck.test @@ -0,0 +1,29 @@ +# REQUIRES: system-darwin + +# This tests the reproducer version check. + +# RUN: rm -rf %t.repro +# RUN: %clang_host %S/Inputs/simple.c -g -o %t.out +# RUN: %lldb -x -b -s %S/Inputs/FileCapture.in --capture --capture-path %t.repro %t.out | FileCheck %s --check-prefix CHECK --check-prefix CAPTURE + +# Make sure that replay works. +# RUN: %lldb --replay %t.repro | FileCheck %s --check-prefix CHECK --check-prefix REPLAY + +# Change the reproducer version. +# RUN: echo "bogus" >> %t.repro/version.txt + +# Make sure that replay fails after the version change. +# RUN: not %lldb --replay %t.repro 2>&1 | FileCheck %s --check-prefix ERROR + +# Make sure that we can circumvent the version check with -reproducer-skip-version-check. +# RUN: %lldb --replay %t.repro -reproducer-skip-version-check | FileCheck %s --check-prefix CHECK --check-prefix REPLAY + +# CAPTURE: testing +# REPLAY-NOT: testing + +# CHECK: Process {{.*}} exited + +# CAPTURE: Reproducer is in capture mode. +# CAPTURE: Reproducer written + +# ERROR: error: reproducer replay failed: reproducer capture and replay version don't match diff --git a/lldb/test/Shell/Reproducer/lit.local.cfg b/lldb/test/Shell/Reproducer/lit.local.cfg index 5659f1baa06df..dbb37b199d781 100644 --- a/lldb/test/Shell/Reproducer/lit.local.cfg +++ b/lldb/test/Shell/Reproducer/lit.local.cfg @@ -1,2 +1,6 @@ # Enable crash reports for the reproducer tests.
-del config.environment['LLVM_DISABLE_CRASH_REPORT'] +if 'LLVM_DISABLE_CRASH_REPORT' in config.environment: + del config.environment['LLVM_DISABLE_CRASH_REPORT'] + +if 'LLDB_CAPTURE_REPRODUCER' in config.environment: + del config.environment['LLDB_CAPTURE_REPRODUCER'] diff --git a/lldb/test/Shell/SymbolFile/DWARF/array-sizes.s b/lldb/test/Shell/SymbolFile/DWARF/array-sizes.s index f00fe2ad005d2..b810527b5535e 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/array-sizes.s +++ b/lldb/test/Shell/SymbolFile/DWARF/array-sizes.s @@ -10,7 +10,7 @@ # RUN: lldb-test symbols %t | FileCheck %s # CHECK: Variable{0x7fffffff0000001e}, name = "X" -# CHECK-SAME: type = {7fffffff00000033} 0x{{[0-9a-f]*}} (char [56]) +# CHECK-SAME: type = {7fffffff00000033} 0x{{[0-9A-F]*}} (char [56]) # Generated from "char X[47];" diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug_ranges.s b/lldb/test/Shell/SymbolFile/DWARF/debug_ranges.s index bbe5cb220c2da..13eea1b80706e 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/debug_ranges.s +++ b/lldb/test/Shell/SymbolFile/DWARF/debug_ranges.s @@ -3,16 +3,13 @@ # RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t # RUN: %lldb %t -o "image lookup -v -s lookup_ranges" -o exit | FileCheck %s -# CHECK: Function: id = {0x7fffffff0000001c}, name = "ranges", range = [0x0000000000000000-0x0000000000000004) -# CHECK: Blocks: id = {0x7fffffff0000001c}, range = [0x00000000-0x00000004) -# CHECK-NEXT: id = {0x7fffffff0000002d}, ranges = [0x00000001-0x00000002)[0x00000003-0x00000004) +# CHECK: Function: id = {0x7fffffff0000002b}, name = "ranges", range = [0x0000000000000000-0x0000000000000004) +# CHECK: Blocks: id = {0x7fffffff0000002b}, range = [0x00000000-0x00000004) +# CHECK-NEXT: id = {0x7fffffff0000003f}, ranges = [0x00000001-0x00000002)[0x00000003-0x00000004) .text .p2align 12 - .globl ranges - .type ranges,@function -ranges: # @ranges -.Lfoo_begin: +ranges: nop .Lblock1_begin: lookup_ranges: @@ -22,21 +19,14 @@ lookup_ranges: .Lblock2_begin: nop .Lblock2_end: -.Lfunc_end0: - .size ranges, .Lfunc_end0-ranges - # -- End function - .section .debug_str,"MS",@progbits,1 -.Lproducer: - .asciz "Hand-written DWARF" -.Lranges: - .asciz "ranges" +.Lranges_end: .section .debug_abbrev,"",@progbits .byte 1 # Abbreviation Code .byte 17 # DW_TAG_compile_unit .byte 1 # DW_CHILDREN_yes .byte 37 # DW_AT_producer - .byte 14 # DW_FORM_strp + .byte 8 # DW_FORM_string .byte 17 # DW_AT_low_pc .byte 1 # DW_FORM_addr .byte 18 # DW_AT_high_pc @@ -51,7 +41,7 @@ lookup_ranges: .byte 18 # DW_AT_high_pc .byte 6 # DW_FORM_data4 .byte 3 # DW_AT_name - .byte 14 # DW_FORM_strp + .byte 8 # DW_FORM_string .byte 0 # EOM(1) .byte 0 # EOM(2) .byte 5 # Abbreviation Code @@ -71,13 +61,13 @@ lookup_ranges: .long .debug_abbrev # Offset Into Abbrev. 
Section .byte 8 # Address Size (in bytes) .byte 1 # Abbrev [1] 0xb:0x7b DW_TAG_compile_unit - .long .Lproducer # DW_AT_producer - .quad .Lfoo_begin # DW_AT_low_pc - .long .Lfunc_end0-.Lfoo_begin # DW_AT_high_pc + .asciz "Hand-written DWARF" # DW_AT_producer + .quad ranges # DW_AT_low_pc + .long .Lranges_end-ranges # DW_AT_high_pc .byte 2 # Abbrev [2] 0x2a:0x4d DW_TAG_subprogram - .quad .Lfoo_begin # DW_AT_low_pc - .long .Lfunc_end0-.Lfoo_begin # DW_AT_high_pc - .long .Lranges # DW_AT_name + .quad ranges # DW_AT_low_pc + .long .Lranges_end-ranges # DW_AT_high_pc + .asciz "ranges" # DW_AT_name .byte 5 # Abbrev [5] 0x61:0x15 DW_TAG_lexical_block .long .Ldebug_ranges0 # DW_AT_ranges .byte 0 # End Of Children Mark @@ -86,9 +76,9 @@ lookup_ranges: .section .debug_ranges,"",@progbits .Ldebug_ranges0: - .quad .Lblock1_begin-.Lfoo_begin - .quad .Lblock1_end-.Lfoo_begin - .quad .Lblock2_begin-.Lfoo_begin - .quad .Lblock2_end-.Lfoo_begin + .quad .Lblock1_begin-ranges + .quad .Lblock1_end-ranges + .quad .Lblock2_begin-ranges + .quad .Lblock2_end-ranges .quad 0 .quad 0 diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug_rnglists.s b/lldb/test/Shell/SymbolFile/DWARF/debug_rnglists.s index 5d95b80e8da6a..1d718054a5877 100644 --- a/lldb/test/Shell/SymbolFile/DWARF/debug_rnglists.s +++ b/lldb/test/Shell/SymbolFile/DWARF/debug_rnglists.s @@ -1,18 +1,22 @@ # REQUIRES: x86 # RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t -# RUN: %lldb %t -o "image lookup -v -s lookup_rnglists" -o exit | FileCheck %s +# RUN: %lldb %t -o "image lookup -v -s lookup_rnglists" \ +# RUN: -o "image lookup -v -s lookup_rnglists2" -o exit | FileCheck %s -# CHECK: Function: id = {0x7fffffff00000021}, name = "rnglists", range = [0x0000000000000000-0x0000000000000004) -# CHECK: Blocks: id = {0x7fffffff00000021}, range = [0x00000000-0x00000004) -# CHECK-NEXT: id = {0x7fffffff00000032}, ranges = [0x00000001-0x00000002)[0x00000003-0x00000004) +# CHECK-LABEL: image lookup -v -s lookup_rnglists +# CHECK: Function: id = {0x7fffffff00000030}, name = "rnglists", range = [0x0000000000000000-0x0000000000000004) +# CHECK: Blocks: id = {0x7fffffff00000030}, range = [0x00000000-0x00000004) +# CHECK-NEXT: id = {0x7fffffff00000046}, ranges = [0x00000001-0x00000002)[0x00000003-0x00000004) + +# CHECK-LABEL: image lookup -v -s lookup_rnglists2 +# CHECK: Function: id = {0x7fffffff0000007a}, name = "rnglists2", range = [0x0000000000000004-0x0000000000000007) +# CHECK: Blocks: id = {0x7fffffff0000007a}, range = [0x00000004-0x00000007) +# CHECK-NEXT: id = {0x7fffffff00000091}, range = [0x00000005-0x00000007) .text .p2align 12 - .globl rnglists - .type rnglists,@function -rnglists: # @rnglists -.Lfoo_begin: +rnglists: nop .Lblock1_begin: lookup_rnglists: @@ -22,21 +26,23 @@ lookup_rnglists: .Lblock2_begin: nop .Lblock2_end: -.Lfunc_end0: - .size rnglists, .Lfunc_end0-rnglists - # -- End function - .section .debug_str,"MS",@progbits,1 -.Lproducer: - .asciz "Hand-written DWARF" -.Lrnglists: - .asciz "rnglists" +.Lrnglists_end: + +rnglists2: + nop +.Lblock3_begin: +lookup_rnglists2: + nop + nop +.Lblock3_end: +.Lrnglists2_end: .section .debug_abbrev,"",@progbits .byte 1 # Abbreviation Code .byte 17 # DW_TAG_compile_unit .byte 1 # DW_CHILDREN_yes .byte 37 # DW_AT_producer - .byte 14 # DW_FORM_strp + .byte 8 # DW_FORM_string .byte 17 # DW_AT_low_pc .byte 1 # DW_FORM_addr .byte 18 # DW_AT_high_pc @@ -53,7 +59,7 @@ lookup_rnglists: .byte 18 # DW_AT_high_pc .byte 6 # DW_FORM_data4 .byte 3 # DW_AT_name - .byte 14 # DW_FORM_strp + .byte 8 # DW_FORM_string 
.byte 0 # EOM(1) .byte 0 # EOM(2) .byte 5 # Abbreviation Code @@ -74,20 +80,42 @@ lookup_rnglists: .byte 8 # Address Size (in bytes) .long .debug_abbrev # Offset Into Abbrev. Section .byte 1 # Abbrev [1] 0xc:0x5f DW_TAG_compile_unit - .long .Lproducer # DW_AT_producer - .quad .Lfoo_begin # DW_AT_low_pc - .long .Lfunc_end0-.Lfoo_begin # DW_AT_high_pc + .asciz "Hand-written DWARF" # DW_AT_producer + .quad rnglists # DW_AT_low_pc + .long .Lrnglists_end-rnglists # DW_AT_high_pc .long .Lrnglists_table_base0 # DW_AT_rnglists_base .byte 2 # Abbrev [2] 0x2b:0x37 DW_TAG_subprogram - .quad .Lfoo_begin # DW_AT_low_pc - .long .Lfunc_end0-.Lfoo_begin # DW_AT_high_pc - .long .Lrnglists # DW_AT_name + .quad rnglists # DW_AT_low_pc + .long .Lrnglists_end-rnglists # DW_AT_high_pc + .asciz "rnglists" # DW_AT_name .byte 5 # Abbrev [5] 0x52:0xf DW_TAG_lexical_block .byte 0 # DW_AT_ranges .byte 0 # End Of Children Mark .byte 0 # End Of Children Mark .Ldebug_info_end0: +.Lcu_begin1: + .long .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit +.Ldebug_info_start1: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 1 # Abbrev [1] 0xc:0x5f DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .quad rnglists2 # DW_AT_low_pc + .long .Lrnglists2_end-rnglists2 # DW_AT_high_pc + .long .Lrnglists_table_base1 # DW_AT_rnglists_base + .byte 2 # Abbrev [2] 0x2b:0x37 DW_TAG_subprogram + .quad rnglists2 # DW_AT_low_pc + .long .Lrnglists2_end-rnglists2 # DW_AT_high_pc + .asciz "rnglists2" # DW_AT_name + .byte 5 # Abbrev [5] 0x52:0xf DW_TAG_lexical_block + .byte 0 # DW_AT_ranges + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Ldebug_info_end1: + .section .debug_rnglists,"",@progbits .long .Ldebug_rnglist_table_end0-.Ldebug_rnglist_table_start0 # Length .Ldebug_rnglist_table_start0: @@ -99,12 +127,25 @@ lookup_rnglists: .long .Ldebug_ranges0-.Lrnglists_table_base0 .Ldebug_ranges0: .byte 4 # DW_RLE_offset_pair - .uleb128 .Lblock1_begin-.Lfoo_begin # starting offset - .uleb128 .Lblock1_end-.Lfoo_begin # ending offset + .uleb128 .Lblock1_begin-rnglists # starting offset + .uleb128 .Lblock1_end-rnglists # ending offset .byte 4 # DW_RLE_offset_pair - .uleb128 .Lblock2_begin-.Lfoo_begin # starting offset - .uleb128 .Lblock2_end-.Lfoo_begin # ending offset + .uleb128 .Lblock2_begin-rnglists # starting offset + .uleb128 .Lblock2_end-rnglists # ending offset .byte 0 # DW_RLE_end_of_list .Ldebug_rnglist_table_end0: - .section .debug_macinfo,"",@progbits - .byte 0 # End Of Macro List Mark + + .long .Ldebug_rnglist_table_end1-.Ldebug_rnglist_table_start1 # Length +.Ldebug_rnglist_table_start1: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 1 # Offset entry count +.Lrnglists_table_base1: + .long .Ldebug_ranges1-.Lrnglists_table_base1 +.Ldebug_ranges1: + .byte 4 # DW_RLE_offset_pair + .uleb128 .Lblock3_begin-rnglists2 # starting offset + .uleb128 .Lblock3_end-rnglists2 # ending offset + .byte 0 # DW_RLE_end_of_list +.Ldebug_rnglist_table_end1: diff --git a/lldb/test/Shell/SymbolFile/DWARF/dwarf5-debug_line.s b/lldb/test/Shell/SymbolFile/DWARF/dwarf5-debug_line.s new file mode 100644 index 0000000000000..d15f31039bbd6 --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/dwarf5-debug_line.s @@ -0,0 +1,129 @@ +# Test handling of DWARF5 line tables. In particular, test that we handle files +# which are present in the line table more than once. 
+ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -o %t -triple x86_64-pc-linux %s +# RUN: %lldb %t -o "source info -f file0.c" -o "source info -f file1.c" \ +# RUN: -o "breakpoint set -f file0.c -l 42" \ +# RUN: -o "breakpoint set -f file0.c -l 47" \ +# RUN: -o exit | FileCheck %s + +# CHECK-LABEL: source info -f file0.c +# CHECK: [0x0000000000000000-0x0000000000000001): /file0.c:42 +# CHECK-LABEL: source info -f file1.c +# CHECK: [0x0000000000000001-0x0000000000000002): /file1.c:47 +# CHECK-LABEL: breakpoint set -f file0.c -l 42 +# CHECK: Breakpoint 1: {{.*}}`foo, +# CHECK-LABEL: breakpoint set -f file0.c -l 47 +# CHECK: Breakpoint 2: {{.*}}`foo + 2, + + .text + .globl foo +foo: + nop + nop + nop +.Lfoo_end: + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 0 # DW_CHILDREN_no + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 27 # DW_AT_comp_dir + .byte 8 # DW_FORM_string + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit +.Ldebug_info_start0: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 1 # Abbrev [1] 0xc:0x23 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 12 # DW_AT_language + .asciz "file0.c" # DW_AT_name + .long .Lline_table_begin # DW_AT_stmt_list + .asciz "/" # DW_AT_comp_dir + .quad foo # DW_AT_low_pc + .long .Lfoo_end-foo # DW_AT_high_pc +.Ldebug_info_end0: + + .section .debug_line,"",@progbits +.Lline_table_begin: + .long .Lline_end-.Lline_start +.Lline_start: + .short 5 # DWARF version number + .byte 8 # Address Size (in bytes) + .byte 0 # Segment Selector Size + .long .Lheader_end-.Lheader_start +.Lheader_start: + .byte 1 # Minimum Instruction Length + .byte 1 # Maximum Operations per Instruction + .byte 1 # Default is_stmt + .byte 0 # Line Base + .byte 0 # Line Range + .byte 5 # Opcode Base + .byte 0, 1, 1, 1 # Standard Opcode Lengths + + # Directory table format + .byte 1 # One element per directory entry + .byte 1 # DW_LNCT_path + .byte 0x08 # DW_FORM_string + + # Directory table entries + .byte 1 # 1 directory + .asciz "/" + + # File table format + .byte 2 # 2 elements per file entry + .byte 1 # DW_LNCT_path + .byte 0x08 # DW_FORM_string + .byte 2 # DW_LNCT_directory_index + .byte 0x0b # DW_FORM_data1 + + # File table entries + .byte 3 # 3 files + .asciz "file0.c" + .byte 0 + .asciz "file1.c" + .byte 0 + .asciz "file0.c" + .byte 0 +.Lheader_end: + + .byte 4, 0 # DW_LNS_set_file 0 + .byte 0, 9, 2 # DW_LNE_set_address + .quad foo + .byte 3, 41 # DW_LNS_advance_line 41 + .byte 1 # DW_LNS_copy + + .byte 4, 1 # DW_LNS_set_file 1 + .byte 2, 1 # DW_LNS_advance_pc 1 + .byte 3, 5 # DW_LNS_advance_line 5 + .byte 1 # DW_LNS_copy + + .byte 4, 2 # DW_LNS_set_file 2 + .byte 2, 1 # DW_LNS_advance_pc 1 + .byte 1 # DW_LNS_copy + + .byte 2, 1 # DW_LNS_advance_pc 1 + .byte 0, 1, 1 # DW_LNE_end_sequence +.Lline_end: diff --git a/lldb/test/Shell/SymbolFile/DWARF/win-i386-line-table.s b/lldb/test/Shell/SymbolFile/DWARF/win-i386-line-table.s new file mode 100644 index 0000000000000..2fa5ba5352b6b --- 
/dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/win-i386-line-table.s @@ -0,0 +1,55 @@ +# Test that lldb can read a line table for an architecture with a different +# address size than that of the host. + +# REQUIRES: lld, x86 + +# RUN: llvm-mc -triple i686-windows-gnu %s -filetype=obj > %t.o +# RUN: lld-link %t.o -out:%t.exe -debug:dwarf -entry:entry -subsystem:console -lldmingw +# RUN: %lldb %t.exe -o "image dump line-table -v win-i386-line-table.c" -b | FileCheck %s + +# CHECK: Line table for win-i386-line-table.c in `win-i386-line-table.s.tmp.exe +# CHECK: 0x00401000: win-i386-line-table.c:2:1 +# CHECK: 0x00401001: win-i386-line-table.c:2:1 + + .text + .file "win-i386-line-table.c" + .globl _entry # -- Begin function entry +_entry: # @entry + .file 1 "" "win-i386-line-table.c" + .loc 1 1 0 # win-i386-line-table.c:1:0 + .cfi_sections .debug_frame + .cfi_startproc + .loc 1 2 1 prologue_end # win-i386-line-table.c:2:1 + retl + .cfi_endproc + # -- End function + .section .debug_str,"dr" +Linfo_string1: + .asciz "win-i386-line-table.c" + .section .debug_abbrev,"dr" +Lsection_abbrev: + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 3 # DW_AT_name + .byte 14 # DW_FORM_strp + .byte 16 # DW_AT_stmt_list + .byte 23 # DW_FORM_sec_offset + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + .section .debug_info,"dr" +Lsection_info: +Lcu_begin0: + .long Ldebug_info_end0-Ldebug_info_start0 # Length of Unit +Ldebug_info_start0: + .short 4 # DWARF version number + .secrel32 Lsection_abbrev # Offset Into Abbrev. Section + .byte 4 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x2d DW_TAG_compile_unit + .secrel32 Linfo_string1 # DW_AT_name + .secrel32 Lline_table_start0 # DW_AT_stmt_list + .byte 0 # End Of Children Mark +Ldebug_info_end0: + .section .debug_line,"dr" +Lline_table_start0: diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 84c5b730dd31e..68891e600169e 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -38,6 +38,10 @@ # test_exec_root: The root path where tests should be run.
config.test_exec_root = os.path.join(config.lldb_obj_root, 'test') +# Propagate LLDB_CAPTURE_REPRODUCER +if 'LLDB_CAPTURE_REPRODUCER' in os.environ: + config.environment['LLDB_CAPTURE_REPRODUCER'] = os.environ[ + 'LLDB_CAPTURE_REPRODUCER'] llvm_config.use_default_substitutions() toolchain.use_lldb_substitutions(config) diff --git a/lldb/tools/debugserver/source/DNB.cpp b/lldb/tools/debugserver/source/DNB.cpp index c9f2e34e2798c..8d9c691f9d337 100644 --- a/lldb/tools/debugserver/source/DNB.cpp +++ b/lldb/tools/debugserver/source/DNB.cpp @@ -1722,6 +1722,8 @@ nub_bool_t DNBSetArchitecture(const char *arch) { else if (strstr(arch, "arm64_32") == arch || strstr(arch, "aarch64_32") == arch) return DNBArchProtocol::SetArchitecture(CPU_TYPE_ARM64_32); + else if (strstr(arch, "arm64e") == arch) + return DNBArchProtocol::SetArchitecture(CPU_TYPE_ARM64); else if (strstr(arch, "arm64") == arch || strstr(arch, "armv8") == arch || strstr(arch, "aarch64") == arch) return DNBArchProtocol::SetArchitecture(CPU_TYPE_ARM64); diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp index 1bf14d97056ce..e8c40910567ca 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp @@ -26,6 +26,10 @@ #include #include +#if __has_feature(ptrauth_calls) +#include +#endif + // Break only in privileged or user mode // (PAC bits in the DBGWVRn_EL1 watchpoint control register) #define S_USER ((uint32_t)(2u << 1)) @@ -93,7 +97,11 @@ uint32_t DNBArchMachARM64::GetCPUType() { return CPU_TYPE_ARM64; } uint64_t DNBArchMachARM64::GetPC(uint64_t failValue) { // Get program counter if (GetGPRState(false) == KERN_SUCCESS) +#if defined(__LP64__) + return arm_thread_state64_get_pc(m_state.context.gpr); +#else return m_state.context.gpr.__pc; +#endif return failValue; } @@ -101,7 +109,17 @@ kern_return_t DNBArchMachARM64::SetPC(uint64_t value) { // Get program counter kern_return_t err = GetGPRState(false); if (err == KERN_SUCCESS) { +#if defined(__LP64__) +#if __has_feature(ptrauth_calls) + // The incoming value could be garbage. Strip it to avoid + // trapping when it gets resigned in the thread state. 
+ value = (uint64_t) ptrauth_strip((void*) value, ptrauth_key_function_pointer); + value = (uint64_t) ptrauth_sign_unauthenticated((void*) value, ptrauth_key_function_pointer, 0); +#endif + arm_thread_state64_set_pc_fptr (m_state.context.gpr, (void*) value); +#else m_state.context.gpr.__pc = value; +#endif err = SetGPRState(); } return err == KERN_SUCCESS; } @@ -110,7 +128,11 @@ kern_return_t DNBArchMachARM64::SetPC(uint64_t value) { uint64_t DNBArchMachARM64::GetSP(uint64_t failValue) { // Get stack pointer if (GetGPRState(false) == KERN_SUCCESS) +#if defined(__LP64__) + return arm_thread_state64_get_sp(m_state.context.gpr); +#else return m_state.context.gpr.__sp; +#endif return failValue; } @@ -167,8 +189,15 @@ kern_return_t DNBArchMachARM64::GetGPRState(bool force) { x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[0], x[11], x[12], x[13], x[14], x[15], x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23], x[24], x[25], x[26], x[27], x[28], +#if defined(__LP64__) + (uint64_t) arm_thread_state64_get_fp (m_state.context.gpr), + (uint64_t) arm_thread_state64_get_lr (m_state.context.gpr), + (uint64_t) arm_thread_state64_get_sp (m_state.context.gpr), + (uint64_t) arm_thread_state64_get_pc (m_state.context.gpr), +#else m_state.context.gpr.__fp, m_state.context.gpr.__lr, m_state.context.gpr.__sp, m_state.context.gpr.__pc, +#endif m_state.context.gpr.__cpsr); } m_state.SetError(set, Read, kret); @@ -564,12 +593,20 @@ kern_return_t DNBArchMachARM64::EnableHardwareSingleStep(bool enable) { if (enable) { DNBLogThreadedIf(LOG_STEP, "%s: Setting MDSCR_EL1 Single Step bit at pc 0x%llx", +#if defined(__LP64__) + __FUNCTION__, (uint64_t)arm_thread_state64_get_pc (m_state.context.gpr)); +#else __FUNCTION__, (uint64_t)m_state.context.gpr.__pc); +#endif m_state.dbg.__mdscr_el1 |= SS_ENABLE; } else { DNBLogThreadedIf(LOG_STEP, "%s: Clearing MDSCR_EL1 Single Step bit at pc 0x%llx", +#if defined(__LP64__) + __FUNCTION__, (uint64_t)arm_thread_state64_get_pc (m_state.context.gpr)); +#else __FUNCTION__, (uint64_t)m_state.context.gpr.__pc); +#endif m_state.dbg.__mdscr_el1 &= ~(SS_ENABLE); } @@ -1409,10 +1446,28 @@ const DNBRegisterInfo DNBArchMachARM64::g_gpr_registers[] = { DEFINE_GPR_IDX(26, x26, NULL, INVALID_NUB_REGNUM), DEFINE_GPR_IDX(27, x27, NULL, INVALID_NUB_REGNUM), DEFINE_GPR_IDX(28, x28, NULL, INVALID_NUB_REGNUM), - DEFINE_GPR_NAME(fp, "x29", GENERIC_REGNUM_FP), - DEFINE_GPR_NAME(lr, "x30", GENERIC_REGNUM_RA), - DEFINE_GPR_NAME(sp, "xsp", GENERIC_REGNUM_SP), - DEFINE_GPR_NAME(pc, NULL, GENERIC_REGNUM_PC), + // For the G/g packet we want to show where the offset into the regctx + // is for fp/lr/sp/pc, but we cannot directly access them on arm64e + // devices (and therefore can't offsetof() them) - add the offset based + // on the last accessible register by hand for advertising the location + // in the regctx to lldb. We'll go through the accessor functions when + // we read/write them here.
+ { + e_regSetGPR, gpr_fp, "fp", "x29", Uint, Hex, 8, GPR_OFFSET_IDX(28) + 8, + dwarf_fp, dwarf_fp, GENERIC_REGNUM_FP, debugserver_gpr_fp, NULL, NULL + }, + { + e_regSetGPR, gpr_lr, "lr", "x30", Uint, Hex, 8, GPR_OFFSET_IDX(28) + 16, + dwarf_lr, dwarf_lr, GENERIC_REGNUM_RA, debugserver_gpr_lr, NULL, NULL + }, + { + e_regSetGPR, gpr_sp, "sp", "xsp", Uint, Hex, 8, GPR_OFFSET_IDX(28) + 24, + dwarf_sp, dwarf_sp, GENERIC_REGNUM_SP, debugserver_gpr_sp, NULL, NULL + }, + { + e_regSetGPR, gpr_pc, "pc", NULL, Uint, Hex, 8, GPR_OFFSET_IDX(28) + 32, + dwarf_pc, dwarf_pc, GENERIC_REGNUM_PC, debugserver_gpr_pc, NULL, NULL + }, // in armv7 we specify that writing to the CPSR should invalidate r8-12, sp, // lr. @@ -1769,7 +1824,20 @@ bool DNBArchMachARM64::GetRegisterValue(uint32_t set, uint32_t reg, switch (set) { case e_regSetGPR: if (reg <= gpr_pc) { +#if defined(__LP64__) + if (reg == gpr_pc) + value->value.uint64 = arm_thread_state64_get_pc (m_state.context.gpr); + else if (reg == gpr_lr) + value->value.uint64 = arm_thread_state64_get_lr (m_state.context.gpr); + else if (reg == gpr_sp) + value->value.uint64 = arm_thread_state64_get_sp (m_state.context.gpr); + else if (reg == gpr_fp) + value->value.uint64 = arm_thread_state64_get_fp (m_state.context.gpr); + else + value->value.uint64 = m_state.context.gpr.__x[reg]; +#else value->value.uint64 = m_state.context.gpr.__x[reg]; +#endif return true; } else if (reg == gpr_cpsr) { value->value.uint32 = m_state.context.gpr.__cpsr; @@ -1859,7 +1927,27 @@ bool DNBArchMachARM64::SetRegisterValue(uint32_t set, uint32_t reg, switch (set) { case e_regSetGPR: if (reg <= gpr_pc) { +#if defined(__LP64__) + uint64_t signed_value = value->value.uint64; +#if __has_feature(ptrauth_calls) + // The incoming value could be garbage. Strip it to avoid + // trapping when it gets resigned in the thread state. 
+ signed_value = (uint64_t) ptrauth_strip((void*) signed_value, ptrauth_key_function_pointer); + signed_value = (uint64_t) ptrauth_sign_unauthenticated((void*) signed_value, ptrauth_key_function_pointer, 0); +#endif + if (reg == gpr_pc) + arm_thread_state64_set_pc_fptr (m_state.context.gpr, (void*) signed_value); + else if (reg == gpr_lr) + arm_thread_state64_set_lr_fptr (m_state.context.gpr, (void*) signed_value); + else if (reg == gpr_sp) + arm_thread_state64_set_sp (m_state.context.gpr, value->value.uint64); + else if (reg == gpr_fp) + arm_thread_state64_set_fp (m_state.context.gpr, value->value.uint64); + else + m_state.context.gpr.__x[reg] = value->value.uint64; +#else m_state.context.gpr.__x[reg] = value->value.uint64; +#endif success = true; } else if (reg == gpr_cpsr) { m_state.context.gpr.__cpsr = value->value.uint32; diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index 61bdf0d8dac64..64e3bc49abc8c 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -4643,6 +4643,24 @@ static bool GetHostCPUType(uint32_t &cputype, uint32_t &cpusubtype, return g_host_cputype != 0; } +static bool GetAddressingBits(uint32_t &addressing_bits) { + static uint32_t g_addressing_bits = 0; + static bool g_tried_addressing_bits_syscall = false; + if (g_tried_addressing_bits_syscall == false) { + size_t len = sizeof (uint32_t); + if (::sysctlbyname("machdep.virtual_address_size", + &g_addressing_bits, &len, NULL, 0) != 0) { + g_addressing_bits = 0; + } + } + g_tried_addressing_bits_syscall = true; + addressing_bits = g_addressing_bits; + if (addressing_bits > 0) + return true; + else + return false; +} + rnb_err_t RNBRemote::HandlePacket_qHostInfo(const char *p) { std::ostringstream strm; @@ -4655,6 +4673,11 @@ rnb_err_t RNBRemote::HandlePacket_qHostInfo(const char *p) { strm << "cpusubtype:" << std::dec << cpusubtype << ';'; } + uint32_t addressing_bits = 0; + if (GetAddressingBits(addressing_bits)) { + strm << "addressing_bits:" << std::dec << addressing_bits << ';'; + } + // The OS in the triple should be "ios" or "macosx" which doesn't match our // "Darwin" which gets returned from "kern.ostype", so we need to hardcode // this for now. diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index fe4a17762f8bc..73874389aa1bb 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -797,7 +797,9 @@ static void printHelp(LLDBOptTable &table, llvm::StringRef tool_name) { llvm::Optional InitializeReproducer(opt::InputArgList &input_args) { if (auto *replay_path = input_args.getLastArg(OPT_replay)) { - if (const char *error = SBReproducer::Replay(replay_path->getValue())) { + const bool skip_version_check = input_args.hasArg(OPT_skip_version_check); + if (const char *error = + SBReproducer::Replay(replay_path->getValue(), skip_version_check)) { WithColor::error() << "reproducer replay failed: " << error << '\n'; return 1; } @@ -854,7 +856,7 @@ int main(int argc, char const *argv[]) { } // Register the reproducer signal handler. 
- llvm::sys::AddSignalHandler(reproducer_handler, (void *)(argv[0])); + llvm::sys::AddSignalHandler(reproducer_handler, const_cast(argv[0])); SBError error = SBDebugger::InitializeWithErrorHandling(); if (error.Fail()) { diff --git a/lldb/tools/driver/Options.td b/lldb/tools/driver/Options.td index 485c0d44bc848..c237f568f64c4 100644 --- a/lldb/tools/driver/Options.td +++ b/lldb/tools/driver/Options.td @@ -232,5 +232,7 @@ def capture_path: Separate<["--", "-"], "capture-path">, def replay: Separate<["--", "-"], "replay">, MetaVarName<"">, HelpText<"Tells the debugger to replay a reproducer from .">; +def skip_version_check: F<"reproducer-skip-version-check">, + HelpText<"Skip the reproducer version check.">; def REM : R<["--"], "">; diff --git a/lldb/tools/lldb-test/lldb-test.cpp b/lldb/tools/lldb-test/lldb-test.cpp index 66c8536301d52..12e4a56059796 100644 --- a/lldb/tools/lldb-test/lldb-test.cpp +++ b/lldb/tools/lldb-test/lldb-test.cpp @@ -549,7 +549,8 @@ Error opts::symbols::findVariables(lldb_private::Module &Module) { CompUnitSP CU; for (size_t Ind = 0; !CU && Ind < Module.GetNumCompileUnits(); ++Ind) { CompUnitSP Candidate = Module.GetCompileUnitAtIndex(Ind); - if (!Candidate || Candidate->GetFilename().GetStringRef() != File) + if (!Candidate || + Candidate->GetPrimaryFile().GetFilename().GetStringRef() != File) continue; if (CU) return make_string_error("Multiple compile units for file `{0}` found.", @@ -653,7 +654,8 @@ Error opts::symbols::verify(lldb_private::Module &Module) { if (!comp_unit) return make_string_error("Connot parse compile unit {0}.", i); - outs() << "Processing '" << comp_unit->GetFilename().AsCString() + outs() << "Processing '" + << comp_unit->GetPrimaryFile().GetFilename().AsCString() << "' compile unit.\n"; LineTable *lt = comp_unit->GetLineTable(); diff --git a/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp b/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp index 150bef1590f4a..deb6c7d54ea9f 100644 --- a/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp +++ b/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp @@ -191,6 +191,8 @@ TEST(CPlusPlusLanguage, FindAlternateFunctionManglings) { EXPECT_THAT(FindAlternate("_ZN1A1fEx"), Contains("_ZN1A1fEl")); EXPECT_THAT(FindAlternate("_ZN1A1fEy"), Contains("_ZN1A1fEm")); EXPECT_THAT(FindAlternate("_ZN1A1fEai"), Contains("_ZN1A1fEci")); + EXPECT_THAT(FindAlternate("_ZN1AC1Ev"), Contains("_ZN1AC2Ev")); + EXPECT_THAT(FindAlternate("_ZN1AD1Ev"), Contains("_ZN1AD2Ev")); EXPECT_THAT(FindAlternate("_bogus"), IsEmpty()); } diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index 12ffdfe79ec32..8bc510bd989aa 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -62,6 +62,14 @@ extern "C" void init_lldb(void) {} #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +// Disable warning C4190: 'LLDBSwigPythonBreakpointCallbackFunction' has +// C-linkage specified, but returns UDT 'llvm::Expected' which is +// incompatible with C +#if _MSC_VER +#pragma warning (push) +#pragma warning (disable : 4190) +#endif + extern "C" llvm::Expected LLDBSwigPythonBreakpointCallbackFunction( const char *python_function_name, const char *session_dictionary_name, const lldb::StackFrameSP &sb_frame, @@ -70,6 +78,10 @@ extern "C" llvm::Expected LLDBSwigPythonBreakpointCallbackFunction( return false; } 
+#if _MSC_VER +#pragma warning (pop) +#endif + #pragma clang diagnostic pop extern "C" bool LLDBSwigPythonWatchpointCallbackFunction( diff --git a/lldb/unittests/Symbol/TestClangASTContext.cpp b/lldb/unittests/Symbol/TestClangASTContext.cpp index 44a824636cf73..8fb24acc7a6a1 100644 --- a/lldb/unittests/Symbol/TestClangASTContext.cpp +++ b/lldb/unittests/Symbol/TestClangASTContext.cpp @@ -169,10 +169,12 @@ TEST_F(TestClangASTContext, TestGetBasicTypeFromName) { EXPECT_EQ(GetBasicQualType(eBasicTypeNullPtr), GetBasicQualType("nullptr")); } -void VerifyEncodingAndBitSize(clang::ASTContext *context, +void VerifyEncodingAndBitSize(ClangASTContext &clang_context, lldb::Encoding encoding, unsigned int bit_size) { - CompilerType type = ClangASTContext::GetBuiltinTypeForEncodingAndBitSize( - context, encoding, bit_size); + clang::ASTContext *context = clang_context.getASTContext(); + + CompilerType type = + clang_context.GetBuiltinTypeForEncodingAndBitSize(encoding, bit_size); EXPECT_TRUE(type.IsValid()); QualType qtype = ClangUtil::GetQualType(type); @@ -206,8 +208,6 @@ void VerifyEncodingAndBitSize(clang::ASTContext *context, } TEST_F(TestClangASTContext, TestBuiltinTypeForEncodingAndBitSize) { - clang::ASTContext *context = m_ast->getASTContext(); - // Make sure we can get types of every possible size in every possible // encoding. // We can't make any guarantee about which specific type we get, because the @@ -215,20 +215,20 @@ TEST_F(TestClangASTContext, TestBuiltinTypeForEncodingAndBitSize) { // isn't that specific. We only need to make sure the compiler hands us some // type that // is both a builtin type and matches the requested bit size. - VerifyEncodingAndBitSize(context, eEncodingSint, 8); - VerifyEncodingAndBitSize(context, eEncodingSint, 16); - VerifyEncodingAndBitSize(context, eEncodingSint, 32); - VerifyEncodingAndBitSize(context, eEncodingSint, 64); - VerifyEncodingAndBitSize(context, eEncodingSint, 128); - - VerifyEncodingAndBitSize(context, eEncodingUint, 8); - VerifyEncodingAndBitSize(context, eEncodingUint, 16); - VerifyEncodingAndBitSize(context, eEncodingUint, 32); - VerifyEncodingAndBitSize(context, eEncodingUint, 64); - VerifyEncodingAndBitSize(context, eEncodingUint, 128); - - VerifyEncodingAndBitSize(context, eEncodingIEEE754, 32); - VerifyEncodingAndBitSize(context, eEncodingIEEE754, 64); + VerifyEncodingAndBitSize(*m_ast, eEncodingSint, 8); + VerifyEncodingAndBitSize(*m_ast, eEncodingSint, 16); + VerifyEncodingAndBitSize(*m_ast, eEncodingSint, 32); + VerifyEncodingAndBitSize(*m_ast, eEncodingSint, 64); + VerifyEncodingAndBitSize(*m_ast, eEncodingSint, 128); + + VerifyEncodingAndBitSize(*m_ast, eEncodingUint, 8); + VerifyEncodingAndBitSize(*m_ast, eEncodingUint, 16); + VerifyEncodingAndBitSize(*m_ast, eEncodingUint, 32); + VerifyEncodingAndBitSize(*m_ast, eEncodingUint, 64); + VerifyEncodingAndBitSize(*m_ast, eEncodingUint, 128); + + VerifyEncodingAndBitSize(*m_ast, eEncodingIEEE754, 32); + VerifyEncodingAndBitSize(*m_ast, eEncodingIEEE754, 64); } TEST_F(TestClangASTContext, TestIsClangType) { diff --git a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp index 0470394d42555..e8a8690c1ff1f 100644 --- a/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp +++ b/lldb/unittests/SymbolFile/PDB/SymbolFilePDBTests.cpp @@ -109,7 +109,7 @@ class SymbolFilePDBTests : public testing::Test { const FileSpec &spec) const { for (size_t i = 0; i < sc_list.GetSize(); ++i) { const SymbolContext &sc = sc_list[i]; - if 
(FileSpecMatchesAsBaseOrFull(*sc.comp_unit, spec)) + if (FileSpecMatchesAsBaseOrFull(sc.comp_unit->GetPrimaryFile(), spec)) return true; } return false; diff --git a/lldb/unittests/Utility/ArchSpecTest.cpp b/lldb/unittests/Utility/ArchSpecTest.cpp index 0186ff05ead8a..9115808c12587 100644 --- a/lldb/unittests/Utility/ArchSpecTest.cpp +++ b/lldb/unittests/Utility/ArchSpecTest.cpp @@ -216,6 +216,41 @@ TEST(ArchSpecTest, MergeFrom) { EXPECT_EQ(llvm::Triple::EnvironmentType::UnknownEnvironment, A.GetTriple().getEnvironment()); } + { + ArchSpec A("arm--linux-eabihf"); + ArchSpec B("armv8l--linux-gnueabihf"); + + EXPECT_TRUE(A.IsValid()); + EXPECT_TRUE(B.IsValid()); + + EXPECT_EQ(llvm::Triple::ArchType::arm, A.GetTriple().getArch()); + EXPECT_EQ(llvm::Triple::ArchType::arm, B.GetTriple().getArch()); + + EXPECT_EQ(ArchSpec::eCore_arm_generic, A.GetCore()); + EXPECT_EQ(ArchSpec::eCore_arm_armv8l, B.GetCore()); + + EXPECT_EQ(llvm::Triple::VendorType::UnknownVendor, + A.GetTriple().getVendor()); + EXPECT_EQ(llvm::Triple::VendorType::UnknownVendor, + B.GetTriple().getVendor()); + + EXPECT_EQ(llvm::Triple::OSType::Linux, A.GetTriple().getOS()); + EXPECT_EQ(llvm::Triple::OSType::Linux, B.GetTriple().getOS()); + + EXPECT_EQ(llvm::Triple::EnvironmentType::EABIHF, + A.GetTriple().getEnvironment()); + EXPECT_EQ(llvm::Triple::EnvironmentType::GNUEABIHF, + B.GetTriple().getEnvironment()); + + A.MergeFrom(B); + EXPECT_EQ(llvm::Triple::ArchType::arm, A.GetTriple().getArch()); + EXPECT_EQ(ArchSpec::eCore_arm_armv8l, A.GetCore()); + EXPECT_EQ(llvm::Triple::VendorType::UnknownVendor, + A.GetTriple().getVendor()); + EXPECT_EQ(llvm::Triple::OSType::Linux, A.GetTriple().getOS()); + EXPECT_EQ(llvm::Triple::EnvironmentType::EABIHF, + A.GetTriple().getEnvironment()); + } } TEST(ArchSpecTest, MergeFromMachOUnknown) { diff --git a/lldb/unittests/Utility/FileSpecTest.cpp b/lldb/unittests/Utility/FileSpecTest.cpp index 0f5b1652d2989..d5f1091d5d469 100644 --- a/lldb/unittests/Utility/FileSpecTest.cpp +++ b/lldb/unittests/Utility/FileSpecTest.cpp @@ -12,6 +12,14 @@ using namespace lldb_private; +static FileSpec PosixSpec(llvm::StringRef path) { + return FileSpec(path, FileSpec::Style::posix); +} + +static FileSpec WindowsSpec(llvm::StringRef path) { + return FileSpec(path, FileSpec::Style::windows); +} + TEST(FileSpecTest, FileAndDirectoryComponents) { FileSpec fs_posix("/foo/bar", FileSpec::Style::posix); EXPECT_STREQ("/foo/bar", fs_posix.GetCString()); @@ -106,8 +114,7 @@ TEST(FileSpecTest, AppendPathComponent) { } TEST(FileSpecTest, CopyByAppendingPathComponent) { - FileSpec fs = FileSpec("/foo", FileSpec::Style::posix) - .CopyByAppendingPathComponent("bar"); + FileSpec fs = PosixSpec("/foo").CopyByAppendingPathComponent("bar"); EXPECT_STREQ("/foo/bar", fs.GetCString()); EXPECT_STREQ("/foo", fs.GetDirectory().GetCString()); EXPECT_STREQ("bar", fs.GetFilename().GetCString()); @@ -136,9 +143,7 @@ TEST(FileSpecTest, PrependPathComponent) { } TEST(FileSpecTest, EqualSeparator) { - FileSpec backward("C:\\foo\\bar", FileSpec::Style::windows); - FileSpec forward("C:/foo/bar", FileSpec::Style::windows); - EXPECT_EQ(forward, backward); + EXPECT_EQ(WindowsSpec("C:\\foo\\bar"), WindowsSpec("C:/foo/bar")); } TEST(FileSpecTest, EqualDotsWindows) { @@ -153,9 +158,8 @@ TEST(FileSpecTest, EqualDotsWindows) { }; for (const auto &test : tests) { - FileSpec one(test.first, FileSpec::Style::windows); - FileSpec two(test.second, FileSpec::Style::windows); - EXPECT_EQ(one, two); + SCOPED_TRACE(llvm::Twine(test.first) + " <=> " + test.second); 
+ EXPECT_EQ(WindowsSpec(test.first), WindowsSpec(test.second)); } } @@ -169,9 +173,8 @@ TEST(FileSpecTest, EqualDotsPosix) { }; for (const auto &test : tests) { - FileSpec one(test.first, FileSpec::Style::posix); - FileSpec two(test.second, FileSpec::Style::posix); - EXPECT_EQ(one, two); + SCOPED_TRACE(llvm::Twine(test.first) + " <=> " + test.second); + EXPECT_EQ(PosixSpec(test.first), PosixSpec(test.second)); } } @@ -183,9 +186,8 @@ TEST(FileSpecTest, EqualDotsPosixRoot) { }; for (const auto &test : tests) { - FileSpec one(test.first, FileSpec::Style::posix); - FileSpec two(test.second, FileSpec::Style::posix); - EXPECT_EQ(one, two); + SCOPED_TRACE(llvm::Twine(test.first) + " <=> " + test.second); + EXPECT_EQ(PosixSpec(test.first), PosixSpec(test.second)); } } @@ -200,7 +202,7 @@ TEST(FileSpecTest, GuessPathStyle) { EXPECT_EQ(llvm::None, FileSpec::GuessPathStyle("foo/bar.txt")); } -TEST(FileSpecTest, GetNormalizedPath) { +TEST(FileSpecTest, GetPath) { std::pair posix_tests[] = { {"/foo/.././bar", "/bar"}, {"/foo/./../bar", "/bar"}, @@ -230,8 +232,7 @@ TEST(FileSpecTest, GetNormalizedPath) { }; for (auto test : posix_tests) { SCOPED_TRACE(llvm::Twine("test.first = ") + test.first); - EXPECT_EQ(test.second, - FileSpec(test.first, FileSpec::Style::posix).GetPath()); + EXPECT_EQ(test.second, PosixSpec(test.first).GetPath()); } std::pair windows_tests[] = { @@ -262,9 +263,8 @@ TEST(FileSpecTest, GetNormalizedPath) { {R"(..\..\foo)", R"(..\..\foo)"}, }; for (auto test : windows_tests) { - EXPECT_EQ(test.second, - FileSpec(test.first, FileSpec::Style::windows).GetPath()) - << "Original path: " << test.first; + SCOPED_TRACE(llvm::Twine("test.first = ") + test.first); + EXPECT_EQ(test.second, WindowsSpec(test.first).GetPath()); } } @@ -315,8 +315,8 @@ TEST(FileSpecTest, IsRelative) { "/foo/../.", }; for (const auto &path: not_relative) { - FileSpec spec(path, FileSpec::Style::posix); - EXPECT_FALSE(spec.IsRelative()); + SCOPED_TRACE(path); + EXPECT_FALSE(PosixSpec(path).IsRelative()); } llvm::StringRef is_relative[] = { ".", @@ -333,8 +333,8 @@ TEST(FileSpecTest, IsRelative) { "./foo/bar.c" }; for (const auto &path: is_relative) { - FileSpec spec(path, FileSpec::Style::posix); - EXPECT_TRUE(spec.IsRelative()); + SCOPED_TRACE(path); + EXPECT_TRUE(PosixSpec(path).IsRelative()); } } @@ -379,3 +379,44 @@ TEST(FileSpecTest, RemoveLastPathComponent) { EXPECT_FALSE(fs_windows.RemoveLastPathComponent()); EXPECT_STREQ("C:", fs_windows.GetCString()); } + +TEST(FileSpecTest, Equal) { + auto Eq = [](const char *a, const char *b, bool full) { + return FileSpec::Equal(PosixSpec(a), PosixSpec(b), full); + }; + EXPECT_TRUE(Eq("/foo/bar", "/foo/bar", true)); + EXPECT_TRUE(Eq("/foo/bar", "/foo/bar", false)); + + EXPECT_FALSE(Eq("/foo/bar", "/foo/baz", true)); + EXPECT_FALSE(Eq("/foo/bar", "/foo/baz", false)); + + EXPECT_FALSE(Eq("/bar/foo", "/baz/foo", true)); + EXPECT_FALSE(Eq("/bar/foo", "/baz/foo", false)); + + EXPECT_FALSE(Eq("/bar/foo", "foo", true)); + EXPECT_TRUE(Eq("/bar/foo", "foo", false)); + + EXPECT_FALSE(Eq("foo", "/bar/foo", true)); + EXPECT_TRUE(Eq("foo", "/bar/foo", false)); +} + +TEST(FileSpecTest, Match) { + auto Match = [](const char *pattern, const char *file) { + return FileSpec::Match(PosixSpec(pattern), PosixSpec(file)); + }; + EXPECT_TRUE(Match("/foo/bar", "/foo/bar")); + EXPECT_FALSE(Match("/foo/bar", "/oof/bar")); + EXPECT_FALSE(Match("/foo/bar", "/foo/baz")); + EXPECT_FALSE(Match("/foo/bar", "bar")); + EXPECT_FALSE(Match("/foo/bar", "")); + + EXPECT_TRUE(Match("bar", "/foo/bar")); + 
EXPECT_FALSE(Match("bar", "/foo/baz")); + EXPECT_TRUE(Match("bar", "bar")); + EXPECT_FALSE(Match("bar", "baz")); + EXPECT_FALSE(Match("bar", "")); + + EXPECT_TRUE(Match("", "/foo/bar")); + EXPECT_TRUE(Match("", "")); + +} diff --git a/lldb/unittests/Utility/StreamTest.cpp b/lldb/unittests/Utility/StreamTest.cpp index 2e2bcb344fcdf..6e42ac2d11f0a 100644 --- a/lldb/unittests/Utility/StreamTest.cpp +++ b/lldb/unittests/Utility/StreamTest.cpp @@ -36,6 +36,98 @@ struct BinaryStreamTest : StreamTest { }; } +TEST_F(StreamTest, AddressPrefix) { + s.Address(0x1, 1, "foo"); + EXPECT_EQ("foo0x01", TakeValue()); +} + +TEST_F(StreamTest, AddressEmptyPrefix) { + s.Address(0x1, 1, nullptr); + EXPECT_EQ("0x01", TakeValue()); + s.Address(0x1, 1, ""); + EXPECT_EQ("0x01", TakeValue()); +} + +TEST_F(StreamTest, AddressSuffix) { + s.Address(0x1, 1, nullptr, "foo"); + EXPECT_EQ("0x01foo", TakeValue()); +} + +TEST_F(StreamTest, AddressNoSuffix) { + s.Address(0x1, 1, nullptr, nullptr); + EXPECT_EQ("0x01", TakeValue()); + s.Address(0x1, 1, nullptr, ""); + EXPECT_EQ("0x01", TakeValue()); +} + +TEST_F(StreamTest, AddressPrefixAndSuffix) { + s.Address(0x1, 1, "foo", "bar"); + EXPECT_EQ("foo0x01bar", TakeValue()); +} + +TEST_F(StreamTest, AddressSize) { + s.Address(0x0, 0); + EXPECT_EQ("0x0", TakeValue()); + s.Address(0x1, 0); + EXPECT_EQ("0x1", TakeValue()); + + s.Address(0x1, 1); + EXPECT_EQ("0x01", TakeValue()); + s.Address(0xf1, 1); + EXPECT_EQ("0xf1", TakeValue()); + s.Address(0xff, 1); + EXPECT_EQ("0xff", TakeValue()); + s.Address(0x100, 1); + EXPECT_EQ("0x100", TakeValue()); + + s.Address(0xf00, 4); + EXPECT_EQ("0x00000f00", TakeValue()); + s.Address(0x100, 8); + EXPECT_EQ("0x0000000000000100", TakeValue()); + s.Address(0x100, 10); + EXPECT_EQ("0x00000000000000000100", TakeValue()); + s.Address(0x1234, 10); + EXPECT_EQ("0x00000000000000001234", TakeValue()); +} + +TEST_F(StreamTest, AddressRange) { + s.AddressRange(0x100, 0x101, 2); + EXPECT_EQ("[0x0100-0x0101)", TakeValue()); +} + +TEST_F(StreamTest, AddressRangeEmptyRange) { + s.AddressRange(0x100, 0x100, 2); + EXPECT_EQ("[0x0100-0x0100)", TakeValue()); + s.AddressRange(0x0, 0x0, 2); + EXPECT_EQ("[0x0000-0x0000)", TakeValue()); +} + +TEST_F(StreamTest, AddressRangeInvalidRange) { + s.AddressRange(0x100, 0x0FF, 2); + EXPECT_EQ("[0x0100-0x00ff)", TakeValue()); + s.AddressRange(0x100, 0x0, 2); + EXPECT_EQ("[0x0100-0x0000)", TakeValue()); +} + +TEST_F(StreamTest, AddressRangeSize) { + s.AddressRange(0x100, 0x101, 0); + EXPECT_EQ("[0x100-0x101)", TakeValue()); + s.AddressRange(0x100, 0x101, 2); + EXPECT_EQ("[0x0100-0x0101)", TakeValue()); + s.AddressRange(0x100, 0x101, 4); + EXPECT_EQ("[0x00000100-0x00000101)", TakeValue()); + + s.AddressRange(0x100, 0x101, 4); + EXPECT_EQ("[0x00000100-0x00000101)", TakeValue()); + s.AddressRange(0x1, 0x101, 4); + EXPECT_EQ("[0x00000001-0x00000101)", TakeValue()); + s.AddressRange(0x101, 0x1, 4); + EXPECT_EQ("[0x00000101-0x00000001)", TakeValue()); + + s.AddressRange(0x1, 0x101, 1); + EXPECT_EQ("[0x01-0x101)", TakeValue()); +} + TEST_F(StreamTest, ChangingByteOrder) { s.SetByteOrder(lldb::eByteOrderPDP); EXPECT_EQ(lldb::eByteOrderPDP, s.GetByteOrder()); @@ -295,24 +387,6 @@ TEST_F(StreamTest, ShiftOperatorStrings) { EXPECT_EQ("cstring\nllvm::StringRef\n", TakeValue()); } -TEST_F(StreamTest, ShiftOperatorInts) { - s << std::numeric_limits::max() << " "; - s << std::numeric_limits::max() << " "; - s << std::numeric_limits::max() << " "; - s << std::numeric_limits::max(); - EXPECT_EQ(40U, s.GetWrittenBytes()); - EXPECT_EQ("127 32767 
2147483647 9223372036854775807", TakeValue()); -} - -TEST_F(StreamTest, ShiftOperatorUInts) { - s << std::numeric_limits::max() << " "; - s << std::numeric_limits::max() << " "; - s << std::numeric_limits::max() << " "; - s << std::numeric_limits::max(); - EXPECT_EQ(33U, s.GetWrittenBytes()); - EXPECT_EQ("ff ffff ffffffff ffffffffffffffff", TakeValue()); -} - TEST_F(StreamTest, ShiftOperatorPtr) { // This test is a bit tricky because pretty much everything related to // pointer printing seems to lead to UB or IB. So let's make the most basic diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index b1a51b332ff0d..1479e29b4a3ac 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -530,10 +530,6 @@ option(LLVM_BUILD_EXAMPLES "Build the LLVM example programs. If OFF, just generate build targets." OFF) option(LLVM_INCLUDE_EXAMPLES "Generate build targets for the LLVM examples" ON) -if(LLVM_BUILD_EXAMPLES) - add_definitions(-DBUILD_EXAMPLES) -endif(LLVM_BUILD_EXAMPLES) - option(LLVM_BUILD_TESTS "Build LLVM unit tests. If OFF, just generate build targets." OFF) option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON) diff --git a/llvm/bindings/go/llvm/dibuilder.go b/llvm/bindings/go/llvm/dibuilder.go index e845369271602..10e18e14d9895 100644 --- a/llvm/bindings/go/llvm/dibuilder.go +++ b/llvm/bindings/go/llvm/dibuilder.go @@ -504,6 +504,7 @@ type DITypedef struct { File Metadata Line int Context Metadata + AlignInBits uint32 } // CreateTypedef creates typedef type debug metadata. @@ -518,6 +519,7 @@ func (d *DIBuilder) CreateTypedef(t DITypedef) Metadata { t.File.C, C.unsigned(t.Line), t.Context.C, + C.uint32_t(t.AlignInBits), ) return Metadata{C: result} } diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake index 028a2cc86bf38..e7e5e5dcf2ff3 100644 --- a/llvm/cmake/config-ix.cmake +++ b/llvm/cmake/config-ix.cmake @@ -166,7 +166,6 @@ if(NOT LLVM_USE_SANITIZER MATCHES "Memory.*") else() include_directories(${LIBXML2_INCLUDE_DIR}) endif() - set(LIBXML2_LIBS "xml2") endif() endif() endif() diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index 7fdca536c1fdb..082393212b674 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -35,6 +35,8 @@ set(TARGET_TRIPLE "@TARGET_TRIPLE@") set(LLVM_ABI_BREAKING_CHECKS @LLVM_ABI_BREAKING_CHECKS@) +set(LLVM_ENABLE_EXPENSIVE_CHECKS @LLVM_ENABLE_EXPENSIVE_CHECKS@) + set(LLVM_ENABLE_ASSERTIONS @LLVM_ENABLE_ASSERTIONS@) set(LLVM_ENABLE_EH @LLVM_ENABLE_EH@) diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst index ff70a21b8dfcc..40aeecdf2c81a 100644 --- a/llvm/docs/CommandGuide/lit.rst +++ b/llvm/docs/CommandGuide/lit.rst @@ -406,17 +406,38 @@ PRE-DEFINED SUBSTITUTIONS :program:`lit` provides various patterns that can be used with the RUN command. These are defined in TestRunner.py. 
The base set of substitutions are:

- ========== ==============
- Macro Substitution
- ========== ==============
- %s source path (path to the file currently being run)
- %S source dir (directory of the file currently being run)
- %p same as %S
- %{pathsep} path separator
- %t temporary file name unique to the test
- %T parent directory of %t (not unique, deprecated, do not use)
- %% %
- ========== ==============
+ ======================= ==============
+ Macro Substitution
+ ======================= ==============
+ %s source path (path to the file currently being run)
+ %S source dir (directory of the file currently being run)
+ %p same as %S
+ %{pathsep} path separator
+ %t temporary file name unique to the test
+ %basename_t the last path component of %t but without the ``.tmp`` extension
+ %T parent directory of %t (not unique, deprecated, do not use)
+ %% %
+ %/s %s but ``\`` is replaced by ``/``
+ %/S %S but ``\`` is replaced by ``/``
+ %/p %p but ``\`` is replaced by ``/``
+ %/t %t but ``\`` is replaced by ``/``
+ %/T %T but ``\`` is replaced by ``/``
+ %{/s:regex_replacement} %/s but escaped for use in the replacement of a ``s@@@`` command in sed
+ %{/S:regex_replacement} %/S but escaped for use in the replacement of a ``s@@@`` command in sed
+ %{/p:regex_replacement} %/p but escaped for use in the replacement of a ``s@@@`` command in sed
+ %{/t:regex_replacement} %/t but escaped for use in the replacement of a ``s@@@`` command in sed
+ %{/T:regex_replacement} %/T but escaped for use in the replacement of a ``s@@@`` command in sed
+ %:s On Windows, %/s but a ``:`` is removed if it's the second character.
+ Otherwise, %s but with a single leading ``/`` removed.
+ %:S On Windows, %/S but a ``:`` is removed if it's the second character.
+ Otherwise, %S but with a single leading ``/`` removed.
+ %:p On Windows, %/p but a ``:`` is removed if it's the second character.
+ Otherwise, %p but with a single leading ``/`` removed.
+ %:t On Windows, %/t but a ``:`` is removed if it's the second character.
+ Otherwise, %t but with a single leading ``/`` removed.
+ %:T On Windows, %/T but a ``:`` is removed if it's the second character.
+ Otherwise, %T but with a single leading ``/`` removed.
+ ======================= ==============

 Other substitutions are provided that are variations on this base set and
 further substitution patterns can be defined by each test module. See the
diff --git a/llvm/docs/Contributing.rst b/llvm/docs/Contributing.rst
index 67adc45e1dcc0..2ad0d9080e12d 100644
--- a/llvm/docs/Contributing.rst
+++ b/llvm/docs/Contributing.rst
@@ -45,7 +45,6 @@ you are interested in working on any of these projects, please send a mail to
 the `LLVM Developer's mailing list`_, so that we know the project is being
 worked on.

-
 How to Submit a Patch
 =====================
 Once you have a patch ready, it is time to submit it. The patch should:
@@ -55,6 +54,35 @@ Once you have a patch ready, it is time to submit it. The patch should:
 * not contain any unrelated changes
 * be an isolated change. Independent changes should be submitted as separate
   patches as this makes reviewing easier.

+.. _format patches:
+
+Before sending a patch for review, please also try to ensure it is
+formatted properly. We use ``clang-format`` for this, which has git integration
+through the ``git-clang-format`` script. On some systems, it may already be
+installed (or be installable via your package manager). If so, you can simply
+run it -- the following command will format only the code changed in the most
+recent commit:
+
+.. code-block:: console
+
+  % git clang-format HEAD~1
+
+Note that this modifies the files, but doesn't commit them -- you'll likely want
+to run
+
+.. code-block:: console
+
+  % git commit --amend -a
+
+in order to update the last commit with all pending changes.
+
+.. note::
+  If you don't already have ``clang-format`` or ``git clang-format`` installed
+  on your system, the ``clang-format`` binary will be built alongside clang, and
+  the git integration can be run from
+  ``clang/tools/clang-format/git-clang-format``.
+
+
 To get a patch accepted, it has to be reviewed by the LLVM community. This can
 be done using `LLVM's Phabricator`_ or the llvm-commits mailing list.
 Please follow :ref:`Phabricator#requesting-a-review-via-the-web-interface
 `
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 52f8e392ce86d..aa37e00b50563 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3334,6 +3334,9 @@ Poison value behavior is defined in terms of value *dependence*:
   be different if the terminator had transferred control to a different
   successor.
 - Dependence is transitive.
+- Vector elements may be independently poisoned. Therefore, transforms
+  on instructions such as shufflevector must be careful to propagate
+  poison across values or elements only as allowed by the original code.

 An instruction that *depends* on a poison value, produces a poison
 value itself. A poison value may be relaxed into an
@@ -8448,10 +8451,13 @@ Semantics:
 The elements of the two input vectors are numbered from left to right
 across both of the vectors. The shuffle mask operand specifies, for each
 element of the result vector, which element of the two input vectors the
-result element gets. If the shuffle mask is undef, the result vector is
-undef. If any element of the mask operand is undef, that element of the
-result is undef. If the shuffle mask selects an undef element from one
-of the input vectors, the resulting element is undef.
+result element gets.
+
+If the shuffle mask is undef, the result vector is undef. If any element
+of the mask operand is undef, that element of the result is undef. If the
+shuffle mask selects an undef element from one of the input vectors, the
+resulting element is undef. An undef mask element prevents a poisoned
+vector element from propagating.

 For scalable vectors, the only valid mask values at present are
 ``zeroinitializer`` and ``undef``, since we cannot write all indices as
diff --git a/llvm/docs/Phabricator.rst b/llvm/docs/Phabricator.rst
index ca23ab3f13078..7de8dc1e6a0bf 100644
--- a/llvm/docs/Phabricator.rst
+++ b/llvm/docs/Phabricator.rst
@@ -62,6 +62,9 @@ to upload your patch):
 * ``git format-patch -U999999 @{u}``
 * ``svn diff --diff-cmd=diff -x -U999999``

+Before uploading your patch, please make sure it is formatted properly, as
+described in :ref:`How to Submit a Patch <format patches>`.
+
 To upload a new patch:

 * Click *Differential*.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index e85a85053fb96..c27f3bc8b692e 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -166,6 +166,16 @@ Changes to the OCaml bindings

 Changes to the C API
 --------------------

+* The C DebugInfo API ``LLVMDIBuilderCreateTypedef`` now takes an extra
+  ``AlignInBits`` argument, so that alignment information specified on a
+  ``typedef`` is propagated to the debug information in LLVM IR.
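To make the new parameter concrete, here is a minimal, hedged sketch of a call
site for the updated C API (an editor's illustration, not part of the patch:
the builder, type, file, and scope handles are assumed to exist elsewhere, and
the typedef name, line number, and 64-bit alignment are hypothetical values):

.. code-block:: c

   #include <string.h>
   #include "llvm-c/DebugInfo.h"

   /* Hedged sketch: DIB, Ty, File, and Scope are assumed to have been created
      by the caller; "my_int", line 10, and the 64-bit alignment are example
      values, not taken from the patch. */
   static LLVMMetadataRef createAlignedTypedef(LLVMDIBuilderRef DIB,
                                               LLVMMetadataRef Ty,
                                               LLVMMetadataRef File,
                                               LLVMMetadataRef Scope) {
     const char *Name = "my_int";
     return LLVMDIBuilderCreateTypedef(DIB, Ty, Name, strlen(Name), File,
                                       /*LineNo=*/10, Scope,
                                       /*AlignInBits=*/64);
   }

The Go binding change below mirrors this by adding an ``AlignInBits`` field to
the ``DITypedef`` descriptor.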
+
+
+Changes to the Go bindings
+--------------------------
+* The Go DebugInfo API ``CreateTypedef`` now takes an extra ``AlignInBits``
+  argument, so that alignment information specified on a ``typedef`` is
+  propagated to the debug information in LLVM IR.

 Changes to the DAG infrastructure
diff --git a/llvm/examples/CMakeLists.txt b/llvm/examples/CMakeLists.txt
index 1fbcbf793b2da..ad99d4c7e3127 100644
--- a/llvm/examples/CMakeLists.txt
+++ b/llvm/examples/CMakeLists.txt
@@ -2,7 +2,6 @@ add_subdirectory(BrainF)
 add_subdirectory(Fibonacci)
 add_subdirectory(HowToUseJIT)
 add_subdirectory(HowToUseLLJIT)
-add_subdirectory(IRTransforms)
 add_subdirectory(LLJITExamples)
 add_subdirectory(Kaleidoscope)
 add_subdirectory(ModuleMaker)
diff --git a/llvm/examples/IRTransforms/CMakeLists.txt b/llvm/examples/IRTransforms/CMakeLists.txt
deleted file mode 100644
index 1c3185eed5ff2..0000000000000
--- a/llvm/examples/IRTransforms/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  Analysis
-  Core
-  Support
-  )
-
-add_llvm_library(ExampleIRTransforms
-  InitializePasses.cpp
-  SimplifyCFG.cpp
-
-  ADDITIONAL_HEADER_DIRS
-
-  DEPENDS
-  intrinsics_gen
-  )
diff --git a/llvm/examples/IRTransforms/InitializePasses.cpp b/llvm/examples/IRTransforms/InitializePasses.cpp
deleted file mode 100644
index 125180715cd41..0000000000000
--- a/llvm/examples/IRTransforms/InitializePasses.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- InitializePasses.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements implements the initialization hook for the example
-// transforms.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InitializePasses.h"
-#include "llvm/PassRegistry.h"
-
-using namespace llvm;
-
-void initializeExampleIRTransforms(PassRegistry &Registry) {
-  initializeSimplifyCFGLegacyPassPass(Registry);
-}
diff --git a/llvm/examples/IRTransforms/InitializePasses.h b/llvm/examples/IRTransforms/InitializePasses.h
deleted file mode 100644
index 8b6673d518e63..0000000000000
--- a/llvm/examples/IRTransforms/InitializePasses.h
+++ /dev/null
@@ -1,22 +0,0 @@
-//===- InitializePasses.h - -------------------------------------*- C++ -*-===//
-//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXAMPLES_IRTRANSFORMS_INITIALIZEPASSES__H -#define LLVM_EXAMPLES_IRTRANSFORMS_INITIALIZEPASSES__H - -#include "llvm/IR/PassManager.h" - -namespace llvm { - -void initializeExampleIRTransforms(PassRegistry &Registry); -void initializeSimplifyCFGLegacyPassPass(PassRegistry &Registry); - -} // end namespace llvm - -#endif diff --git a/llvm/examples/IRTransforms/SimplifyCFG.cpp b/llvm/examples/IRTransforms/SimplifyCFG.cpp deleted file mode 100644 index 10658c9f09590..0000000000000 --- a/llvm/examples/IRTransforms/SimplifyCFG.cpp +++ /dev/null @@ -1,414 +0,0 @@ -//===- SimplifyCFG.cpp ----------------------------------------------------===// -// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the control flow graph (CFG) simplifications -// presented as part of the 'Getting Started With LLVM: Basics' tutorial at the -// US LLVM Developers Meeting 2019. It also contains additional material. -// -// The current file contains three different CFG simplifications. There are -// multiple versions of each implementation (e.g. _v1 and _v2), which implement -// additional functionality (e.g. preserving analysis like the DominatorTree) or -// use additional utilities to simplify the code (e.g. LLVM's PatternMatch.h). -// The available simplifications are: -// 1. Trivially Dead block Removal (removeDeadBlocks_v[1,2]). -// This simplifications removes all blocks without predecessors in the CFG -// from a function. -// 2. Conditional Branch Elimination (eliminateCondBranches_v[1,2,3]) -// This simplification replaces conditional branches with constant integer -// conditions with unconditional branches. -// 3. Single Predecessor Block Merging (mergeIntoSinglePredecessor_v[1,2]) -// This simplification merges blocks with a single predecessor into the -// predecessor, if that block has a single successor. -// -// TODOs -// * Hook up pass to the new pass manager. -// * Preserve LoopInfo. -// * Add fixed point iteration to delete all dead blocks -// * Add implementation using reachability to discover dead blocks. -//===----------------------------------------------------------------------===// - -#include "SimplifyCFG.h" -#include "InitializePasses.h" -#include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/IR/Dominators.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/PassManager.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/InitializePasses.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; -using namespace PatternMatch; - -enum TutorialVersion { V1, V2, V3 }; -static cl::opt - Version("tut-simplifycfg-version", cl::desc("Select tutorial version"), - cl::Hidden, cl::ValueOptional, cl::init(V1), - cl::values(clEnumValN(V1, "v1", "version 1"), - clEnumValN(V2, "v2", "version 2"), - clEnumValN(V3, "v3", "version 3"), - // Sentinel value for unspecified option. - clEnumValN(V3, "", ""))); - -#define DEBUG_TYPE "tut-simplifycfg" - -// Remove trivially dead blocks. First version, not preserving the -// DominatorTree. -static bool removeDeadBlocks_v1(Function &F) { - bool Changed = false; - - // Remove trivially dead blocks. 
- for (BasicBlock &BB : make_early_inc_range(F)) { - // Skip blocks we know to not be trivially dead. We know a block is - // guaranteed to be dead, iff it is neither the entry block nor - // has any predecessors. - if (&F.getEntryBlock() == &BB || !pred_empty(&BB)) - continue; - - // Notify successors of BB that BB is going to be removed. This removes - // incoming values from BB from PHIs in the successors. Note that this will - // not actually remove BB from the predecessor lists of its successors. - for (BasicBlock *Succ : successors(&BB)) - Succ->removePredecessor(&BB); - // TODO: Find a better place to put such small variations. - // Alternatively, we can update the PHI nodes manually: - // for (PHINode &PN : make_early_inc_range(Succ->phis())) - // PN.removeIncomingValue(&BB); - - // Replace all instructions in BB with an undef constant. The block is - // unreachable, so the results of the instructions should never get used. - while (!BB.empty()) { - Instruction &I = BB.back(); - I.replaceAllUsesWith(UndefValue::get(I.getType())); - I.eraseFromParent(); - } - - // Finally remove the basic block. - BB.eraseFromParent(); - Changed = true; - } - - return Changed; -} - -// Remove trivially dead blocks. This is the second version and preserves the -// dominator tree. -static bool removeDeadBlocks_v2(Function &F, DominatorTree &DT) { - bool Changed = false; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - SmallVector DTUpdates; - - // Remove trivially dead blocks. - for (BasicBlock &BB : make_early_inc_range(F)) { - // Skip blocks we know to not be trivially dead. We know a block is - // guaranteed to be dead, iff it is neither the entry block nor - // has any predecessors. - if (&F.getEntryBlock() == &BB || !pred_empty(&BB)) - continue; - - // Notify successors of BB that BB is going to be removed. This removes - // incoming values from BB from PHIs in the successors. Note that this will - // not actually remove BB from the predecessor lists of its successors. - for (BasicBlock *Succ : successors(&BB)) { - Succ->removePredecessor(&BB); - - // Collect updates that need to be applied to the dominator tree. - DTUpdates.push_back({DominatorTree::Delete, &BB, Succ}); - } - - // Remove BB via the DomTreeUpdater. DomTreeUpdater::deleteBB conveniently - // removes the instructions in BB as well. - DTU.deleteBB(&BB); - Changed = true; - } - - // Apply updates permissively, to remove duplicates. - DTU.applyUpdatesPermissive(DTUpdates); - - return Changed; -} - -// Eliminate branches with constant conditionals. This is the first version, -// which *does not* preserve the dominator tree. -static bool eliminateCondBranches_v1(Function &F) { - bool Changed = false; - - // Eliminate branches with constant conditionals. - for (BasicBlock &BB : F) { - // Skip blocks without conditional branches as terminators. - BranchInst *BI = dyn_cast(BB.getTerminator()); - if (!BI || !BI->isConditional()) - continue; - - // Skip blocks with conditional branches without ConstantInt conditions. - ConstantInt *CI = dyn_cast(BI->getCondition()); - if (!CI) - continue; - - // We use the branch condition (CI), to select the successor we remove: - // if CI == 1 (true), we remove the second successor, otherwise the first. - BasicBlock *RemovedSucc = BI->getSuccessor(CI->isOne()); - // Tell RemovedSucc we will remove BB from its predecessors. 
- RemovedSucc->removePredecessor(&BB); - - // Replace the conditional branch with an unconditional one, by creating - // a new unconditional branch to the selected successor and removing the - // conditional one. - BranchInst::Create(BI->getSuccessor(CI->isZero()), BI); - BI->eraseFromParent(); - Changed = true; - } - - return Changed; -} - -// Eliminate branches with constant conditionals. This is the second -// version, which *does* preserve the dominator tree. -static bool eliminateCondBranches_v2(Function &F, DominatorTree &DT) { - bool Changed = false; - - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - SmallVector DTUpdates; - // Eliminate branches with constant conditionals. - for (BasicBlock &BB : F) { - // Skip blocks without conditional branches as terminators. - BranchInst *BI = dyn_cast(BB.getTerminator()); - if (!BI || !BI->isConditional()) - continue; - - // Skip blocks with conditional branches without ConstantInt conditions. - ConstantInt *CI = dyn_cast(BI->getCondition()); - if (!CI) - continue; - - // We use the branch condition (CI), to select the successor we remove: - // if CI == 1 (true), we remove the second successor, otherwise the first. - BasicBlock *RemovedSucc = BI->getSuccessor(CI->isOne()); - // Tell RemovedSucc we will remove BB from its predecessors. - RemovedSucc->removePredecessor(&BB); - - // Replace the conditional branch with an unconditional one, by creating - // a new unconditional branch to the selected successor and removing the - // conditional one. - BranchInst *NewBranch = - BranchInst::Create(BI->getSuccessor(CI->isZero()), BI); - BI->eraseFromParent(); - - // Delete the edge between BB and RemovedSucc in the DominatorTree, iff - // the conditional branch did not use RemovedSucc as both the true and false - // branches. - if (NewBranch->getSuccessor(0) != RemovedSucc) - DTUpdates.push_back({DominatorTree::Delete, &BB, RemovedSucc}); - Changed = true; - } - - // Apply updates permissively, to remove duplicates. - DTU.applyUpdatesPermissive(DTUpdates); - - return Changed; -} - -// Eliminate branches with constant conditionals. This is the third -// version, which uses PatternMatch.h. -static bool eliminateCondBranches_v3(Function &F, DominatorTree &DT) { - bool Changed = false; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - SmallVector DTUpdates; - - // Eliminate branches with constant conditionals. - for (BasicBlock &BB : F) { - ConstantInt *CI = nullptr; - BasicBlock *TakenSucc, *RemovedSucc; - // Check if the terminator is a conditional branch, with constant integer - // condition and also capture the successor blocks as TakenSucc and - // RemovedSucc. - if (!match(BB.getTerminator(), - m_Br(m_ConstantInt(CI), m_BasicBlock(TakenSucc), - m_BasicBlock(RemovedSucc)))) - continue; - - // If the condition is false, swap TakenSucc and RemovedSucc. - if (CI->isZero()) - std::swap(TakenSucc, RemovedSucc); - - // Tell RemovedSucc we will remove BB from its predecessors. - RemovedSucc->removePredecessor(&BB); - - // Replace the conditional branch with an unconditional one, by creating - // a new unconditional branch to the selected successor and removing the - // conditional one. - - BranchInst *NewBranch = BranchInst::Create(TakenSucc, BB.getTerminator()); - BB.getTerminator()->eraseFromParent(); - - // Delete the edge between BB and RemovedSucc in the DominatorTree, iff - // the conditional branch did not use RemovedSucc as both the true and false - // branches. 
- if (NewBranch->getSuccessor(0) != RemovedSucc) - DTUpdates.push_back({DominatorTree::Delete, &BB, RemovedSucc}); - Changed = true; - } - - // Apply updates permissively, to remove duplicates. - DTU.applyUpdatesPermissive(DTUpdates); - return Changed; -} - -// Merge basic blocks into their single predecessor, if their predecessor has a -// single successor. This is the first version and does not preserve the -// DominatorTree. -static bool mergeIntoSinglePredecessor_v1(Function &F) { - bool Changed = false; - - // Merge blocks with single predecessors. - for (BasicBlock &BB : make_early_inc_range(F)) { - BasicBlock *Pred = BB.getSinglePredecessor(); - // Make sure BB has a single predecessor Pred and BB is the single - // successor of Pred. - if (!Pred || Pred->getSingleSuccessor() != &BB) - continue; - - // Do not try to merge self loops. That can happen in dead blocks. - if (Pred == &BB) - continue; - - // Need to replace it before nuking the branch. - BB.replaceAllUsesWith(Pred); - // PHI nodes in BB can only have a single incoming value. Remove them. - for (PHINode &PN : make_early_inc_range(BB.phis())) { - PN.replaceAllUsesWith(PN.getIncomingValue(0)); - PN.eraseFromParent(); - } - // Move all instructions from BB to Pred. - for (Instruction &I : make_early_inc_range(BB)) - I.moveBefore(Pred->getTerminator()); - - // Remove the Pred's terminator (which jumped to BB). BB's terminator - // will become Pred's terminator. - Pred->getTerminator()->eraseFromParent(); - BB.eraseFromParent(); - - Changed = true; - } - - return Changed; -} - -// Merge basic blocks into their single predecessor, if their predecessor has a -// single successor. This is the second version and does preserve the -// DominatorTree. -static bool mergeIntoSinglePredecessor_v2(Function &F, DominatorTree &DT) { - bool Changed = false; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - SmallVector DTUpdates; - - // Merge blocks with single predecessors. - for (BasicBlock &BB : make_early_inc_range(F)) { - BasicBlock *Pred = BB.getSinglePredecessor(); - // Make sure BB has a single predecessor Pred and BB is the single - // successor of Pred. - if (!Pred || Pred->getSingleSuccessor() != &BB) - continue; - - // Do not try to merge self loops. That can happen in dead blocks. - if (Pred == &BB) - continue; - - // Tell DTU about the changes to the CFG: All edges from BB to its - // successors get removed and we add edges between Pred and BB's successors. - for (BasicBlock *Succ : successors(&BB)) { - DTUpdates.push_back({DominatorTree::Delete, &BB, Succ}); - DTUpdates.push_back({DominatorTree::Insert, Pred, Succ}); - } - // Also remove the edge between Pred and BB. - DTUpdates.push_back({DominatorTree::Delete, Pred, &BB}); - - // Need to replace it before nuking the branch. - BB.replaceAllUsesWith(Pred); - // PHI nodes in BB can only have a single incoming value. Remove them. - for (PHINode &PN : make_early_inc_range(BB.phis())) { - PN.replaceAllUsesWith(PN.getIncomingValue(0)); - PN.eraseFromParent(); - } - // Move all instructions from BB to Pred. - for (Instruction &I : make_early_inc_range(BB)) - I.moveBefore(Pred->getTerminator()); - - // Remove the Pred's terminator (which jumped to BB). BB's terminator - // will become Pred's terminator. - Pred->getTerminator()->eraseFromParent(); - DTU.deleteBB(&BB); - - Changed = true; - } - - // Apply updates permissively, to remove duplicates. 
- DTU.applyUpdatesPermissive(DTUpdates); - return Changed; -} - -static bool doSimplify_v1(Function &F) { - return eliminateCondBranches_v1(F) & mergeIntoSinglePredecessor_v1(F) & - removeDeadBlocks_v1(F); -} - -static bool doSimplify_v2(Function &F, DominatorTree &DT) { - return eliminateCondBranches_v2(F, DT) & - mergeIntoSinglePredecessor_v2(F, DT) & removeDeadBlocks_v2(F, DT); -} - -static bool doSimplify_v3(Function &F, DominatorTree &DT) { - return eliminateCondBranches_v3(F, DT) & - mergeIntoSinglePredecessor_v2(F, DT) & removeDeadBlocks_v2(F, DT); -} - -namespace { -struct SimplifyCFGLegacyPass : public FunctionPass { - static char ID; - SimplifyCFGLegacyPass() : FunctionPass(ID) { - initializeSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - // Version 1 of the implementation does not preserve the dominator tree. - if (Version != V1) - AU.addPreserved(); - - FunctionPass::getAnalysisUsage(AU); - } - - bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; - - switch (Version) { - case V1: - return doSimplify_v1(F); - case V2: { - auto &DT = getAnalysis().getDomTree(); - return doSimplify_v2(F, DT); - } - case V3: { - auto &DT = getAnalysis().getDomTree(); - return doSimplify_v3(F, DT); - } - } - - llvm_unreachable("Unsupported version"); - } -}; -} // namespace - -char SimplifyCFGLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(SimplifyCFGLegacyPass, DEBUG_TYPE, - "Tutorial CFG simplification", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END(SimplifyCFGLegacyPass, DEBUG_TYPE, - "Tutorial CFG simplifications", false, false) diff --git a/llvm/examples/IRTransforms/SimplifyCFG.h b/llvm/examples/IRTransforms/SimplifyCFG.h deleted file mode 100644 index 09328afb01d36..0000000000000 --- a/llvm/examples/IRTransforms/SimplifyCFG.h +++ /dev/null @@ -1,24 +0,0 @@ -//===- SimplifyCFG.h - Tutorial SimplifyCFG ---------------------*- C++ -*-===// -// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_EXAMPLES_IRTRANSFORMS_SIMPLIFYCFG__H -#define LLVM_EXAMPLES_IRTRANSFORMS_SIMPLIFYCFG__H - -#include "llvm/Pass.h" -#include "llvm/PassRegistry.h" - -namespace llvm { - -FunctionPass *createSimplifyCFGPass(); - -void initializeSimplifyCFGLegacyPassPass(PassRegistry &); - -} // end namespace llvm - -#endif // LLVM_EXAMPLES_IRTRANSFORMS_SIMPLIFYCFG__H diff --git a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h index a7fa3afc470cd..020b72c23947a 100644 --- a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h +++ b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h @@ -39,14 +39,17 @@ class KaleidoscopeJIT { MangleAndInterner Mangle; ThreadSafeContext Ctx; + JITDylib &MainJD; + public: KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL) : ObjectLayer(ES, []() { return std::make_unique(); }), CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))), DL(std::move(DL)), Mangle(ES, this->DL), - Ctx(std::make_unique()) { - ES.getMainJITDylib().addGenerator( + Ctx(std::make_unique()), + MainJD(ES.createJITDylib("
")) { + MainJD.addGenerator( cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess( DL.getGlobalPrefix()))); } @@ -69,12 +72,11 @@ class KaleidoscopeJIT { LLVMContext &getContext() { return *Ctx.getContext(); } Error addModule(std::unique_ptr M) { - return CompileLayer.add(ES.getMainJITDylib(), - ThreadSafeModule(std::move(M), Ctx)); + return CompileLayer.add(MainJD, ThreadSafeModule(std::move(M), Ctx)); } Expected lookup(StringRef Name) { - return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str())); + return ES.lookup({&MainJD}, Mangle(Name.str())); } }; diff --git a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h index e9999efd37a51..8037e58ae4f72 100644 --- a/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h +++ b/llvm/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h @@ -45,15 +45,17 @@ class KaleidoscopeJIT { MangleAndInterner Mangle; ThreadSafeContext Ctx; + JITDylib &MainJD; + public: KaleidoscopeJIT(JITTargetMachineBuilder JTMB, DataLayout DL) : ObjectLayer(ES, []() { return std::make_unique(); }), CompileLayer(ES, ObjectLayer, ConcurrentIRCompiler(std::move(JTMB))), - OptimizeLayer(ES, CompileLayer, optimizeModule), - DL(std::move(DL)), Mangle(ES, this->DL), - Ctx(std::make_unique()) { - ES.getMainJITDylib().addGenerator( + OptimizeLayer(ES, CompileLayer, optimizeModule), DL(std::move(DL)), + Mangle(ES, this->DL), Ctx(std::make_unique()), + MainJD(ES.createJITDylib("
")) { + MainJD.addGenerator( cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess( DL.getGlobalPrefix()))); } @@ -76,12 +78,11 @@ class KaleidoscopeJIT { } Error addModule(std::unique_ptr M) { - return OptimizeLayer.add(ES.getMainJITDylib(), - ThreadSafeModule(std::move(M), Ctx)); + return OptimizeLayer.add(MainJD, ThreadSafeModule(std::move(M), Ctx)); } Expected lookup(StringRef Name) { - return ES.lookup({&ES.getMainJITDylib()}, Mangle(Name.str())); + return ES.lookup({&MainJD}, Mangle(Name.str())); } private: diff --git a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp index 1fd1fc92a73f9..f4cfb7403dbd7 100644 --- a/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp +++ b/llvm/examples/SpeculativeJIT/SpeculativeJIT.cpp @@ -76,12 +76,12 @@ class SpeculativeJIT { ExecutionSession &getES() { return *ES; } - Error addModule(JITDylib &JD, ThreadSafeModule TSM) { - return CODLayer.add(JD, std::move(TSM)); + Error addModule(ThreadSafeModule TSM) { + return CODLayer.add(MainJD, std::move(TSM)); } Expected lookup(StringRef UnmangledName) { - return ES->lookup({&ES->getMainJITDylib()}, Mangle(UnmangledName)); + return ES->lookup({&MainJD}, Mangle(UnmangledName)); } ~SpeculativeJIT() { CompileThreads.wait(); } @@ -101,15 +101,15 @@ class SpeculativeJIT { std::unique_ptr LCTMgr, IndirectStubsManagerBuilderFunction ISMBuilder, std::unique_ptr ProcessSymbolsGenerator) - : ES(std::move(ES)), DL(std::move(DL)), LCTMgr(std::move(LCTMgr)), + : ES(std::move(ES)), DL(std::move(DL)), + MainJD(this->ES->createJITDylib("
")), LCTMgr(std::move(LCTMgr)), CompileLayer(*this->ES, ObjLayer, ConcurrentIRCompiler(std::move(JTMB))), S(Imps, *this->ES), SpeculateLayer(*this->ES, CompileLayer, S, Mangle, BlockFreqQuery()), CODLayer(*this->ES, SpeculateLayer, *this->LCTMgr, std::move(ISMBuilder)) { - this->ES->getMainJITDylib().addGenerator( - std::move(ProcessSymbolsGenerator)); + MainJD.addGenerator(std::move(ProcessSymbolsGenerator)); this->CODLayer.setImplMap(&Imps); this->ES->setDispatchMaterialization( @@ -119,9 +119,9 @@ class SpeculativeJIT { auto Work = [SharedMU, &JD]() { SharedMU->doMaterialize(JD); }; CompileThreads.async(std::move(Work)); }); - ExitOnErr(S.addSpeculationRuntime(this->ES->getMainJITDylib(), Mangle)); + ExitOnErr(S.addSpeculationRuntime(MainJD, Mangle)); LocalCXXRuntimeOverrides CXXRuntimeoverrides; - ExitOnErr(CXXRuntimeoverrides.enable(this->ES->getMainJITDylib(), Mangle)); + ExitOnErr(CXXRuntimeoverrides.enable(MainJD, Mangle)); } static std::unique_ptr createMemMgr() { @@ -133,6 +133,8 @@ class SpeculativeJIT { MangleAndInterner Mangle{*ES, DL}; ThreadPool CompileThreads{NumThreads}; + JITDylib &MainJD; + Triple TT; std::unique_ptr LCTMgr; IRCompileLayer CompileLayer; @@ -172,24 +174,14 @@ int main(int argc, char *argv[]) { return 1; } - ExitOnErr(SJ->addModule(SJ->getES().getMainJITDylib(), - ThreadSafeModule(std::move(M), std::move(Ctx)))); + ExitOnErr(SJ->addModule(ThreadSafeModule(std::move(M), std::move(Ctx)))); } - // Build an argv array for the JIT'd main. - std::vector ArgV; - ArgV.push_back(argv[0]); - for (const auto &InputArg : InputArgv) - ArgV.push_back(InputArg.data()); - ArgV.push_back(nullptr); - - // Look up the JIT'd main, cast it to a function pointer, then call it. - auto MainSym = ExitOnErr(SJ->lookup("main")); - int (*Main)(int, const char *[]) = - (int (*)(int, const char *[]))MainSym.getAddress(); + auto Main = + jitTargetAddressToFunction(MainSym.getAddress()); - Main(ArgV.size() - 1, ArgV.data()); + return runAsMain(Main, InputArgv, StringRef(InputFiles.front())); return 0; } diff --git a/llvm/include/llvm-c/DebugInfo.h b/llvm/include/llvm-c/DebugInfo.h index ab60b88a31f46..731f32741e191 100644 --- a/llvm/include/llvm-c/DebugInfo.h +++ b/llvm/include/llvm-c/DebugInfo.h @@ -874,7 +874,7 @@ LLVMMetadataRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Builder, LLVMMetadataRef Type, const char *Name, size_t NameLen, LLVMMetadataRef File, unsigned LineNo, - LLVMMetadataRef Scope); + LLVMMetadataRef Scope, uint32_t AlignInBits); /** * Create debugging information entry to establish inheritance relationship diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index 70fc19e82b3c7..afeed67e3f9e8 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -851,6 +851,9 @@ class APFloat : public APFloatBase { APFloat(const fltSemantics &Semantics) : U(Semantics) {} APFloat(const fltSemantics &Semantics, StringRef S); APFloat(const fltSemantics &Semantics, integerPart I) : U(Semantics, I) {} + template ::value>::type> + APFloat(const fltSemantics &Semantics, T V) = delete; // TODO: Remove this constructor. This isn't faster than the first one. APFloat(const fltSemantics &Semantics, uninitializedTag) : U(Semantics, uninitialized) {} diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 796110f753bc0..0791a6d686a3f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -389,6 +389,11 @@ class LLVM_NODISCARD APInt { /// \returns true if this APInt is positive. 
bool isStrictlyPositive() const { return isNonNegative() && !isNullValue(); } + /// Determine if this APInt Value is non-positive (<= 0). + /// + /// \returns true if this APInt is non-positive. + bool isNonPositive() const { return !isStrictlyPositive(); } + /// Determine if all bits are set /// /// This checks to see if the value has all bits of the APInt are set or not. diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index c8965936fb9c1..41d6c23b8d0d9 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -34,6 +34,7 @@ namespace llvm { class Function; class LoopInfo; class raw_ostream; +class PostDominatorTree; class TargetLibraryInfo; class Value; @@ -187,8 +188,10 @@ class BranchProbabilityInfo { /// Track the set of blocks that always lead to a cold call. SmallPtrSet PostDominatedByColdCall; - void updatePostDominatedByUnreachable(const BasicBlock *BB); - void updatePostDominatedByColdCall(const BasicBlock *BB); + void computePostDominatedByUnreachable(const Function &F, + PostDominatorTree *PDT); + void computePostDominatedByColdCall(const Function &F, + PostDominatorTree *PDT); bool calcUnreachableHeuristics(const BasicBlock *BB); bool calcMetadataWeights(const BasicBlock *BB); bool calcColdCallHeuristics(const BasicBlock *BB); diff --git a/llvm/include/llvm/Analysis/DDG.h b/llvm/include/llvm/Analysis/DDG.h index 7a572afccd67d..22df60efd84ec 100644 --- a/llvm/include/llvm/Analysis/DDG.h +++ b/llvm/include/llvm/Analysis/DDG.h @@ -300,6 +300,7 @@ using DDGInfo = DependenceGraphInfo; /// Data Dependency Graph class DataDependenceGraph : public DDGBase, public DDGInfo { + friend AbstractDependenceGraphBuilder; friend class DDGBuilder; public: @@ -311,7 +312,7 @@ class DataDependenceGraph : public DDGBase, public DDGInfo { DataDependenceGraph(DataDependenceGraph &&G) : DDGBase(std::move(G)), DDGInfo(std::move(G)) {} DataDependenceGraph(Function &F, DependenceInfo &DI); - DataDependenceGraph(const Loop &L, DependenceInfo &DI); + DataDependenceGraph(Loop &L, LoopInfo &LI, DependenceInfo &DI); ~DataDependenceGraph(); /// If node \p N belongs to a pi-block return a pointer to the pi-block, @@ -381,6 +382,12 @@ class DDGBuilder : public AbstractDependenceGraphBuilder { return *E; } + const NodeListType &getNodesInPiBlock(const DDGNode &N) final override { + auto *PiNode = dyn_cast(&N); + assert(PiNode && "Expected a pi-block node."); + return PiNode->getNodes(); + } + bool shouldCreatePiBlocks() const final override; }; diff --git a/llvm/include/llvm/Analysis/DependenceGraphBuilder.h b/llvm/include/llvm/Analysis/DependenceGraphBuilder.h index 876ccbc5c9a04..99465ef39a076 100644 --- a/llvm/include/llvm/Analysis/DependenceGraphBuilder.h +++ b/llvm/include/llvm/Analysis/DependenceGraphBuilder.h @@ -59,6 +59,7 @@ template class AbstractDependenceGraphBuilder { createMemoryDependencyEdges(); createAndConnectRootNode(); createPiBlocks(); + sortNodesTopologically(); } /// Create fine grained nodes. These are typically atomic nodes that @@ -84,6 +85,9 @@ template class AbstractDependenceGraphBuilder { /// the dependence graph into an acyclic graph. void createPiBlocks(); + /// Topologically sort the graph nodes. + void sortNodesTopologically(); + protected: /// Create the root node of the graph. virtual NodeType &createRootNode() = 0; @@ -104,6 +108,10 @@ template class AbstractDependenceGraphBuilder { /// Create a rooted edge going from \p Src to \p Tgt . 
virtual EdgeType &createRootedEdge(NodeType &Src, NodeType &Tgt) = 0; + /// Given a pi-block node, return a vector of all the nodes contained within + /// it. + virtual const NodeListType &getNodesInPiBlock(const NodeType &N) = 0; + /// Deallocate memory of edge \p E. virtual void destroyEdge(EdgeType &E) { delete &E; } diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 57f84c1d0ebf0..5286f6a220ec8 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1922,6 +1922,13 @@ class ScalarEvolutionAnalysis ScalarEvolution run(Function &F, FunctionAnalysisManager &AM); }; +/// Verifier pass for the \c ScalarEvolutionAnalysis results. +class ScalarEvolutionVerifierPass + : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + /// Printer pass for the \c ScalarEvolutionAnalysis results. class ScalarEvolutionPrinterPass : public PassInfoMixin { diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index e9c96cc96cb53..f5f805493d320 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -82,13 +82,36 @@ struct VFParameter { struct VFShape { unsigned VF; // Vectorization factor. bool IsScalable; // True if the function is a scalable function. - VFISAKind ISA; // Instruction Set Architecture. SmallVector Parameters; // List of parameter informations. // Comparison operator. bool operator==(const VFShape &Other) const { - return std::tie(VF, IsScalable, ISA, Parameters) == - std::tie(Other.VF, Other.IsScalable, Other.ISA, Other.Parameters); + return std::tie(VF, IsScalable, Parameters) == + std::tie(Other.VF, Other.IsScalable, Other.Parameters); } + + /// Update the parameter in position P.ParamPos to P. + void updateParam(VFParameter P) { + assert(P.ParamPos < Parameters.size() && "Invalid parameter position."); + Parameters[P.ParamPos] = P; + assert(hasValidParameterList() && "Invalid parameter list"); + } + + // Retrieve the basic vectorization shape of the function, where all + // parameters are mapped to VFParamKind::Vector with \p EC + // lanes. Specifies whether the function has a Global Predicate + // argument via \p HasGlobalPred. + static VFShape get(const CallInst &CI, ElementCount EC, bool HasGlobalPred) { + SmallVector Parameters; + for (unsigned I = 0; I < CI.arg_size(); ++I) + Parameters.push_back(VFParameter({I, VFParamKind::Vector})); + if (HasGlobalPred) + Parameters.push_back( + VFParameter({CI.arg_size(), VFParamKind::GlobalPredicate})); + + return {EC.Min, EC.Scalable, Parameters}; + } + /// Sanity check on the Parameters in the VFShape. + bool hasValidParameterList() const; }; /// Holds the VFShape for a specific scalar to vector function mapping. @@ -96,11 +119,12 @@ struct VFInfo { VFShape Shape; // Classification of the vector function. StringRef ScalarName; // Scalar Function Name. StringRef VectorName; // Vector Function Name associated to this VFInfo. + VFISAKind ISA; // Instruction Set Architecture. // Comparison operator. 
bool operator==(const VFInfo &Other) const { - return std::tie(Shape, ScalarName, VectorName) == - std::tie(Shape, Other.ScalarName, Other.VectorName); + return std::tie(Shape, ScalarName, VectorName, ISA) == + std::tie(Shape, Other.ScalarName, Other.VectorName, Other.ISA); } }; diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 46edfb6260be1..caab91da9c839 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1199,8 +1199,9 @@ enum { PT_SUNW_EH_FRAME = 0x6474e550, PT_SUNW_UNWIND = 0x6464e550, - PT_GNU_STACK = 0x6474e551, // Indicates stack executability. - PT_GNU_RELRO = 0x6474e552, // Read-only after relocation. + PT_GNU_STACK = 0x6474e551, // Indicates stack executability. + PT_GNU_RELRO = 0x6474e552, // Read-only after relocation. + PT_GNU_PROPERTY = 0x6474e553, // .note.gnu.property notes sections. PT_OPENBSD_RANDOMIZE = 0x65a3dbe6, // Fill with random data. PT_OPENBSD_WXNEEDED = 0x65a3dbe7, // Program does W^X violations. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index d184530d6447d..642f8828b0f57 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -1157,6 +1157,12 @@ class LegalizerInfo { virtual bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const; + /// Return the opcode (SEXT/ZEXT/ANYEXT) that should be performed while + /// widening a constant of type SmallTy which targets can override. + /// For eg, the DAG does (SmallTy.isByteSized() ? G_SEXT : G_ZEXT) which + /// will be the default. + virtual unsigned getExtOpcodeForWideningConstant(LLT SmallTy) const; + private: /// Determine what action should be taken to legalize the given generic /// instruction opcode, type-index and type. 
Requires computeTables to have diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h index 503227222207f..149fe043d1f56 100644 --- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h @@ -38,6 +38,51 @@ enum class MachineCombinerPattern { MULSUBX_OP2, MULADDXI_OP1, MULSUBXI_OP1, + // NEON integers vectors + MULADDv8i8_OP1, + MULADDv8i8_OP2, + MULADDv16i8_OP1, + MULADDv16i8_OP2, + MULADDv4i16_OP1, + MULADDv4i16_OP2, + MULADDv8i16_OP1, + MULADDv8i16_OP2, + MULADDv2i32_OP1, + MULADDv2i32_OP2, + MULADDv4i32_OP1, + MULADDv4i32_OP2, + + MULSUBv8i8_OP1, + MULSUBv8i8_OP2, + MULSUBv16i8_OP1, + MULSUBv16i8_OP2, + MULSUBv4i16_OP1, + MULSUBv4i16_OP2, + MULSUBv8i16_OP1, + MULSUBv8i16_OP2, + MULSUBv2i32_OP1, + MULSUBv2i32_OP2, + MULSUBv4i32_OP1, + MULSUBv4i32_OP2, + + MULADDv4i16_indexed_OP1, + MULADDv4i16_indexed_OP2, + MULADDv8i16_indexed_OP1, + MULADDv8i16_indexed_OP2, + MULADDv2i32_indexed_OP1, + MULADDv2i32_indexed_OP2, + MULADDv4i32_indexed_OP1, + MULADDv4i32_indexed_OP2, + + MULSUBv4i16_indexed_OP1, + MULSUBv4i16_indexed_OP2, + MULSUBv8i16_indexed_OP1, + MULSUBv8i16_indexed_OP2, + MULSUBv2i32_indexed_OP1, + MULSUBv2i32_indexed_OP2, + MULSUBv4i32_indexed_OP1, + MULSUBv4i32_indexed_OP2, + // Floating Point FMULADDH_OP1, FMULADDH_OP2, diff --git a/llvm/include/llvm/CodeGen/MachineInstrBundle.h b/llvm/include/llvm/CodeGen/MachineInstrBundle.h index 1810d23072d00..517f03e609337 100644 --- a/llvm/include/llvm/CodeGen/MachineInstrBundle.h +++ b/llvm/include/llvm/CodeGen/MachineInstrBundle.h @@ -75,12 +75,12 @@ inline MachineBasicBlock::const_instr_iterator getBundleEnd( } //===----------------------------------------------------------------------===// -// MachineOperand iterator +// MachineBundleOperand iterator // -/// MachineOperandIteratorBase - Iterator that can visit all operands on a -/// MachineInstr, or all operands on a bundle of MachineInstrs. This class is -/// not intended to be used directly, use one of the sub-classes instead. +/// MIBundleOperandIteratorBase - Iterator that visits all operands in a bundle +/// of MachineInstrs. This class is not intended to be used directly, use one +/// of the sub-classes instead. /// /// Intended use: /// @@ -90,7 +90,10 @@ inline MachineBasicBlock::const_instr_iterator getBundleEnd( /// ... /// } /// -class MachineOperandIteratorBase { +template +class MIBundleOperandIteratorBase + : public iterator_facade_base, + std::forward_iterator_tag, ValueT> { MachineBasicBlock::instr_iterator InstrI, InstrE; MachineInstr::mop_iterator OpI, OpE; @@ -99,35 +102,34 @@ class MachineOperandIteratorBase { void advance() { while (OpI == OpE) { // Don't advance off the basic block, or into a new bundle. - if (++InstrI == InstrE || !InstrI->isInsideBundle()) + if (++InstrI == InstrE || !InstrI->isInsideBundle()) { + InstrI = InstrE; break; + } OpI = InstrI->operands_begin(); OpE = InstrI->operands_end(); } } protected: - /// MachineOperandIteratorBase - Create an iterator that visits all operands + /// MIBundleOperandIteratorBase - Create an iterator that visits all operands /// on MI, or all operands on every instruction in the bundle containing MI. /// /// @param MI The instruction to examine. - /// @param WholeBundle When true, visit all operands on the entire bundle. 
/// - explicit MachineOperandIteratorBase(MachineInstr &MI, bool WholeBundle) { - if (WholeBundle) { - InstrI = getBundleStart(MI.getIterator()); - InstrE = MI.getParent()->instr_end(); - } else { - InstrI = InstrE = MI.getIterator(); - ++InstrE; - } + explicit MIBundleOperandIteratorBase(MachineInstr &MI) { + InstrI = getBundleStart(MI.getIterator()); + InstrE = MI.getParent()->instr_end(); OpI = InstrI->operands_begin(); OpE = InstrI->operands_end(); - if (WholeBundle) - advance(); + advance(); } - MachineOperand &deref() const { return *OpI; } + /// Constructor for an iterator past the last iteration: both instruction + /// iterators point to the end of the BB and OpI == OpE. + explicit MIBundleOperandIteratorBase(MachineBasicBlock::instr_iterator InstrE, + MachineInstr::mop_iterator OpE) + : InstrI(InstrE), InstrE(InstrE), OpI(OpE), OpE(OpE) {} public: /// isValid - Returns true until all the operands have been visited. @@ -140,123 +142,148 @@ class MachineOperandIteratorBase { advance(); } + ValueT &operator*() const { return *OpI; } + ValueT *operator->() const { return &*OpI; } + + bool operator==(const MIBundleOperandIteratorBase &Arg) const { + // Iterators are equal, if InstrI matches and either OpIs match or OpI == + // OpE match for both. The second condition allows us to construct an 'end' + // iterator, without finding the last instruction in a bundle up-front. + return InstrI == Arg.InstrI && + (OpI == Arg.OpI || (OpI == OpE && Arg.OpI == Arg.OpE)); + } /// getOperandNo - Returns the number of the current operand relative to its /// instruction. /// unsigned getOperandNo() const { return OpI - InstrI->operands_begin(); } - - /// VirtRegInfo - Information about a virtual register used by a set of operands. - /// - struct VirtRegInfo { - /// Reads - One of the operands read the virtual register. This does not - /// include undef or internal use operands, see MO::readsReg(). - bool Reads; - - /// Writes - One of the operands writes the virtual register. - bool Writes; - - /// Tied - Uses and defs must use the same register. This can be because of - /// a two-address constraint, or there may be a partial redefinition of a - /// sub-register. - bool Tied; - }; - - /// Information about how a physical register Reg is used by a set of - /// operands. - struct PhysRegInfo { - /// There is a regmask operand indicating Reg is clobbered. - /// \see MachineOperand::CreateRegMask(). - bool Clobbered; - - /// Reg or one of its aliases is defined. The definition may only cover - /// parts of the register. - bool Defined; - /// Reg or a super-register is defined. The definition covers the full - /// register. - bool FullyDefined; - - /// Reg or one of its aliases is read. The register may only be read - /// partially. - bool Read; - /// Reg or a super-register is read. The full register is read. - bool FullyRead; - - /// Either: - /// - Reg is FullyDefined and all defs of reg or an overlapping - /// register are dead, or - /// - Reg is completely dead because "defined" by a clobber. - bool DeadDef; - - /// Reg is Defined and all defs of reg or an overlapping register are - /// dead. - bool PartialDeadDef; - - /// There is a use operand of reg or a super-register with kill flag set. - bool Killed; - }; - - /// analyzeVirtReg - Analyze how the current instruction or bundle uses a - /// virtual register. This function should not be called after operator++(), - /// it expects a fresh iterator. - /// - /// @param Reg The virtual register to analyze. 
- /// @param Ops When set, this vector will receive an (MI, OpNum) entry for - /// each operand referring to Reg. - /// @returns A filled-in RegInfo struct. - VirtRegInfo analyzeVirtReg(unsigned Reg, - SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops = nullptr); - - /// analyzePhysReg - Analyze how the current instruction or bundle uses a - /// physical register. This function should not be called after operator++(), - /// it expects a fresh iterator. - /// - /// @param Reg The physical register to analyze. - /// @returns A filled-in PhysRegInfo struct. - PhysRegInfo analyzePhysReg(unsigned Reg, const TargetRegisterInfo *TRI); }; -/// MIOperands - Iterate over operands of a single instruction. +/// MIBundleOperands - Iterate over all operands in a bundle of machine +/// instructions. /// -class MIOperands : public MachineOperandIteratorBase { +class MIBundleOperands : public MIBundleOperandIteratorBase<MachineOperand> { + /// Constructor for an iterator past the last iteration. + MIBundleOperands(MachineBasicBlock::instr_iterator InstrE, + MachineInstr::mop_iterator OpE) + : MIBundleOperandIteratorBase(InstrE, OpE) {} + public: - MIOperands(MachineInstr &MI) : MachineOperandIteratorBase(MI, false) {} - MachineOperand &operator* () const { return deref(); } - MachineOperand *operator->() const { return &deref(); } + MIBundleOperands(MachineInstr &MI) : MIBundleOperandIteratorBase(MI) {} + + /// Returns an iterator past the last iteration. + static MIBundleOperands end(const MachineBasicBlock &MBB) { + return {const_cast<MachineBasicBlock &>(MBB).instr_end(), + const_cast<MachineBasicBlock &>(MBB).instr_begin()->operands_end()}; + } }; -/// ConstMIOperands - Iterate over operands of a single const instruction. +/// ConstMIBundleOperands - Iterate over all operands in a const bundle of +/// machine instructions. /// -class ConstMIOperands : public MachineOperandIteratorBase { +class ConstMIBundleOperands + : public MIBundleOperandIteratorBase<const MachineOperand> { + + /// Constructor for an iterator past the last iteration. + ConstMIBundleOperands(MachineBasicBlock::instr_iterator InstrE, + MachineInstr::mop_iterator OpE) + : MIBundleOperandIteratorBase(InstrE, OpE) {} + public: - ConstMIOperands(const MachineInstr &MI) - : MachineOperandIteratorBase(const_cast<MachineInstr &>(MI), false) {} - const MachineOperand &operator* () const { return deref(); } - const MachineOperand *operator->() const { return &deref(); } + ConstMIBundleOperands(const MachineInstr &MI) + : MIBundleOperandIteratorBase(const_cast<MachineInstr &>(MI)) {} + + /// Returns an iterator past the last iteration. + static ConstMIBundleOperands end(const MachineBasicBlock &MBB) { + return {const_cast<MachineBasicBlock &>(MBB).instr_end(), + const_cast<MachineBasicBlock &>(MBB).instr_begin()->operands_end()}; + } }; -/// MIBundleOperands - Iterate over all operands in a bundle of machine -/// instructions. +inline iterator_range<ConstMIBundleOperands> +const_mi_bundle_ops(const MachineInstr &MI) { + return make_range(ConstMIBundleOperands(MI), + ConstMIBundleOperands::end(*MI.getParent())); +} + +inline iterator_range<MIBundleOperands> mi_bundle_ops(MachineInstr &MI) { + return make_range(MIBundleOperands(MI), + MIBundleOperands::end(*MI.getParent())); +} + +/// VirtRegInfo - Information about a virtual register used by a set of +/// operands. /// -class MIBundleOperands : public MachineOperandIteratorBase { -public: - MIBundleOperands(MachineInstr &MI) : MachineOperandIteratorBase(MI, true) {} - MachineOperand &operator* () const { return deref(); } - MachineOperand *operator->() const { return &deref(); } +struct VirtRegInfo { + /// Reads - One of the operands read the virtual register.
This does not + /// include undef or internal use operands, see MO::readsReg(). + bool Reads; + + /// Writes - One of the operands writes the virtual register. + bool Writes; + + /// Tied - Uses and defs must use the same register. This can be because of + /// a two-address constraint, or there may be a partial redefinition of a + /// sub-register. + bool Tied; }; -/// ConstMIBundleOperands - Iterate over all operands in a const bundle of -/// machine instructions. +/// AnalyzeVirtRegInBundle - Analyze how the current instruction or bundle uses +/// a virtual register. This function should not be called after operator++(), +/// it expects a fresh iterator. /// -class ConstMIBundleOperands : public MachineOperandIteratorBase { -public: - ConstMIBundleOperands(const MachineInstr &MI) - : MachineOperandIteratorBase(const_cast(MI), true) {} - const MachineOperand &operator* () const { return deref(); } - const MachineOperand *operator->() const { return &deref(); } +/// @param Reg The virtual register to analyze. +/// @param Ops When set, this vector will receive an (MI, OpNum) entry for +/// each operand referring to Reg. +/// @returns A filled-in RegInfo struct. +VirtRegInfo AnalyzeVirtRegInBundle( + MachineInstr &MI, unsigned Reg, + SmallVectorImpl> *Ops = nullptr); + +/// Information about how a physical register Reg is used by a set of +/// operands. +struct PhysRegInfo { + /// There is a regmask operand indicating Reg is clobbered. + /// \see MachineOperand::CreateRegMask(). + bool Clobbered; + + /// Reg or one of its aliases is defined. The definition may only cover + /// parts of the register. + bool Defined; + /// Reg or a super-register is defined. The definition covers the full + /// register. + bool FullyDefined; + + /// Reg or one of its aliases is read. The register may only be read + /// partially. + bool Read; + /// Reg or a super-register is read. The full register is read. + bool FullyRead; + + /// Either: + /// - Reg is FullyDefined and all defs of reg or an overlapping + /// register are dead, or + /// - Reg is completely dead because "defined" by a clobber. + bool DeadDef; + + /// Reg is Defined and all defs of reg or an overlapping register are + /// dead. + bool PartialDeadDef; + + /// There is a use operand of reg or a super-register with kill flag set. + bool Killed; }; +/// AnalyzePhysRegInBundle - Analyze how the current instruction or bundle uses +/// a physical register. This function should not be called after operator++(), +/// it expects a fresh iterator. +/// +/// @param Reg The physical register to analyze. +/// @returns A filled-in PhysRegInfo struct. +PhysRegInfo AnalyzePhysRegInBundle(const MachineInstr &MI, unsigned Reg, + const TargetRegisterInfo *TRI); + } // End llvm namespace #endif diff --git a/llvm/include/llvm/CodeGen/MachineSizeOpts.h b/llvm/include/llvm/CodeGen/MachineSizeOpts.h index 75e871d974757..3b02d0860ea15 100644 --- a/llvm/include/llvm/CodeGen/MachineSizeOpts.h +++ b/llvm/include/llvm/CodeGen/MachineSizeOpts.h @@ -23,14 +23,16 @@ class MachineBlockFrequencyInfo; class MachineFunction; /// Returns true if machine function \p MF is suggested to be size-optimized -/// base on the profile. +/// based on the profile. bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *BFI); + const MachineBlockFrequencyInfo *BFI, + PGSOQueryType QueryType = PGSOQueryType::Other); /// Returns true if machine basic block \p MBB is suggested to be size-optimized -/// base on the profile. 
+/// based on the profile. bool shouldOptimizeForSize(const MachineBasicBlock *MBB, ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI); + const MachineBlockFrequencyInfo *MBFI, + PGSOQueryType QueryType = PGSOQueryType::Other); } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h index c21414760ce9c..4e3451d80572b 100644 --- a/llvm/include/llvm/CodeGen/Passes.h +++ b/llvm/include/llvm/CodeGen/Passes.h @@ -463,6 +463,9 @@ namespace llvm { /// Create Hardware Loop pass. \see HardwareLoops.cpp FunctionPass *createHardwareLoopsPass(); + /// Create IR Type Promotion pass. \see TypePromotion.cpp + FunctionPass *createTypePromotionPass(); + } // End llvm namespace #endif diff --git a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h index 9ab9e8068eabf..ac001e326c570 100644 --- a/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h +++ b/llvm/include/llvm/CodeGen/ReachingDefAnalysis.h @@ -87,17 +87,42 @@ class ReachingDefAnalysis : public MachineFunctionPass { MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); + MachineFunctionProperties::Property::NoVRegs).set( + MachineFunctionProperties::Property::TracksLiveness); } /// Provides the instruction id of the closest reaching def instruction of /// PhysReg that reaches MI, relative to the beginning of MI's basic block. int getReachingDef(MachineInstr *MI, int PhysReg); + /// Provides the instruction of the closest reaching def instruction of + /// PhysReg that reaches MI, relative to the beginning of MI's basic block. + MachineInstr *getReachingMIDef(MachineInstr *MI, int PhysReg); + + /// Provides the MI, from the given block, corresponding to the Id or a + /// nullptr if the id does not refer to the block. + MachineInstr *getInstFromId(MachineBasicBlock *MBB, int InstId); + + /// Return whether A and B use the same def of PhysReg. + bool hasSameReachingDef(MachineInstr *A, MachineInstr *B, int PhysReg); + + /// Return whether the given register is used after MI, whether it's a local + /// use or a live out. + bool isRegUsedAfter(MachineInstr *MI, int PhysReg); + /// Provides the clearance - the number of instructions since the closest /// reaching def instruction of PhysReg that reaches MI. int getClearance(MachineInstr *MI, MCPhysReg PhysReg); + /// Provides the uses, in the same block as MI, of the register that MI + /// defines. This does not consider live-outs. + void getReachingLocalUses(MachineInstr *MI, int PhysReg, + SmallVectorImpl<MachineInstr*> &Uses); + + /// Provides the number of uses, in the same block as MI, of the register + /// that MI defines. + unsigned getNumUses(MachineInstr *MI, int PhysReg); + private: /// Set up LiveRegs by merging predecessor live-out values.
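The new ReachingDefAnalysis queries compose into simple def/use reasoning. As a rough sketch of a consumer, assuming the usual MachineFunctionPass plumbing (the helper name and the rewrite it guards are hypothetical, not part of this patch):

```cpp
#include "llvm/CodeGen/ReachingDefAnalysis.h"
using namespace llvm;

// Sketch: decide whether UseB may be rewritten to read the definition that
// UseA observes. Both uses must see the same reaching def of PhysReg, the
// def must be local to the block, and PhysReg must die at or before UseB.
static bool canForwardDef(ReachingDefAnalysis &RDA, MachineInstr *UseA,
                          MachineInstr *UseB, int PhysReg) {
  if (!RDA.hasSameReachingDef(UseA, UseB, PhysReg))
    return false;
  MachineInstr *Def = RDA.getReachingMIDef(UseA, PhysReg);
  if (!Def)
    return false; // reaching def is a live-in from a predecessor block
  // Keep the transformation simple: the def feeds exactly one local use
  // and the register is not consumed later (the rewrite itself is elided).
  return RDA.getNumUses(Def, PhysReg) == 1 &&
         !RDA.isRegUsedAfter(UseB, PhysReg);
}
```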
void enterBasicBlock(const LoopTraversal::TraversedMBBInfo &TraversedMBB); diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index a0e37a19b37dd..3c5675395e114 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -427,7 +427,7 @@ class SelectionDAG { const TargetLibraryInfo &getLibInfo() const { return *LibInfo; } const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; } const LegacyDivergenceAnalysis *getDivergenceAnalysis() const { return DA; } - LLVMContext *getContext() const {return Context; } + LLVMContext *getContext() const { return Context; } OptimizationRemarkEmitter &getORE() const { return *ORE; } ProfileSummaryInfo *getPSI() const { return PSI; } BlockFrequencyInfo *getBFI() const { return BFI; } @@ -1136,14 +1136,19 @@ class SelectionDAG { /// Returns sum of the base pointer and offset. SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, const SDLoc &DL); - SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, - SDValue Mask, SDValue Src0, EVT MemVT, - MachineMemOperand *MMO, ISD::LoadExtType, - bool IsExpanding = false); + SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, + SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, + MachineMemOperand *MMO, ISD::MemIndexedMode AM, + ISD::LoadExtType, bool IsExpanding = false); + SDValue getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl, SDValue Base, + SDValue Offset, ISD::MemIndexedMode AM); SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, - SDValue Ptr, SDValue Mask, EVT MemVT, - MachineMemOperand *MMO, bool IsTruncating = false, - bool IsCompressing = false); + SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, ISD::MemIndexedMode AM, + bool IsTruncating = false, bool IsCompressing = false); + SDValue getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM); SDValue getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType); diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 3b799f967318a..e18278f8cdc61 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -553,6 +553,7 @@ BEGIN_TWO_BYTE_PACK() class LSBaseSDNodeBitfields { friend class LSBaseSDNode; + friend class MaskedLoadStoreSDNode; friend class MaskedGatherScatterSDNode; uint16_t : NumMemSDNodeBits; @@ -560,6 +561,7 @@ BEGIN_TWO_BYTE_PACK() // This storage is shared between disparate class hierarchies to hold an // enumeration specific to the class hierarchy in use. 
// LSBaseSDNode => enum ISD::MemIndexedMode + // MaskedLoadStoreSDNode => enum ISD::MemIndexedMode // MaskedGatherScatterSDNode => enum ISD::MemIndexType uint16_t AddressingMode : 3; }; @@ -2273,19 +2275,38 @@ class MaskedLoadStoreSDNode : public MemSDNode { friend class SelectionDAG; MaskedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, + const DebugLoc &dl, SDVTList VTs, + ISD::MemIndexedMode AM, EVT MemVT, MachineMemOperand *MMO) - : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) {} + : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + LSBaseSDNodeBits.AddressingMode = AM; + assert(getAddressingMode() == AM && "Value truncated"); + } - // MaskedLoadSDNode (Chain, ptr, mask, passthru) - // MaskedStoreSDNode (Chain, data, ptr, mask) + // MaskedLoadSDNode (Chain, ptr, offset, mask, passthru) + // MaskedStoreSDNode (Chain, data, ptr, offset, mask) // Mask is a vector of i1 elements const SDValue &getBasePtr() const { return getOperand(getOpcode() == ISD::MLOAD ? 1 : 2); } - const SDValue &getMask() const { + const SDValue &getOffset() const { return getOperand(getOpcode() == ISD::MLOAD ? 2 : 3); } + const SDValue &getMask() const { + return getOperand(getOpcode() == ISD::MLOAD ? 3 : 4); + } + + /// Return the addressing mode for this load or store: + /// unindexed, pre-inc, pre-dec, post-inc, or post-dec. + ISD::MemIndexedMode getAddressingMode() const { + return static_cast<ISD::MemIndexedMode>(LSBaseSDNodeBits.AddressingMode); + } + + /// Return true if this is a pre/post inc/dec load/store. + bool isIndexed() const { return getAddressingMode() != ISD::UNINDEXED; } + + /// Return true if this is NOT a pre/post inc/dec load/store. + bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD || @@ -2299,9 +2320,9 @@ class MaskedLoadSDNode : public MaskedLoadStoreSDNode { friend class SelectionDAG; MaskedLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - ISD::LoadExtType ETy, bool IsExpanding, EVT MemVT, - MachineMemOperand *MMO) - : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, MemVT, MMO) { + ISD::MemIndexedMode AM, ISD::LoadExtType ETy, + bool IsExpanding, EVT MemVT, MachineMemOperand *MMO) + : MaskedLoadStoreSDNode(ISD::MLOAD, Order, dl, VTs, AM, MemVT, MMO) { LoadSDNodeBits.ExtTy = ETy; LoadSDNodeBits.IsExpanding = IsExpanding; } @@ -2311,8 +2332,9 @@ class MaskedLoadSDNode : public MaskedLoadStoreSDNode { } const SDValue &getBasePtr() const { return getOperand(1); } - const SDValue &getMask() const { return getOperand(2); } - const SDValue &getPassThru() const { return getOperand(3); } + const SDValue &getOffset() const { return getOperand(2); } + const SDValue &getMask() const { return getOperand(3); } + const SDValue &getPassThru() const { return getOperand(4); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MLOAD; @@ -2327,9 +2349,9 @@ class MaskedStoreSDNode : public MaskedLoadStoreSDNode { friend class SelectionDAG; MaskedStoreSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - bool isTrunc, bool isCompressing, EVT MemVT, - MachineMemOperand *MMO) - : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, MemVT, MMO) { + ISD::MemIndexedMode AM, bool isTrunc, bool isCompressing, + EVT MemVT, MachineMemOperand *MMO) + : MaskedLoadStoreSDNode(ISD::MSTORE, Order, dl, VTs, AM, MemVT, MMO) { StoreSDNodeBits.IsTruncating = isTrunc; StoreSDNodeBits.IsCompressing = isCompressing; } @@ -2345,9 +2367,10 @@ class
MaskedStoreSDNode : public MaskedLoadStoreSDNode { /// memory at base_addr. bool isCompressingStore() const { return StoreSDNodeBits.IsCompressing; } - const SDValue &getValue() const { return getOperand(1); } + const SDValue &getValue() const { return getOperand(1); } const SDValue &getBasePtr() const { return getOperand(2); } - const SDValue &getMask() const { return getOperand(3); } + const SDValue &getOffset() const { return getOperand(3); } + const SDValue &getMask() const { return getOperand(4); } static bool classof(const SDNode *N) { return N->getOpcode() == ISD::MSTORE; diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index fa84d0efbdea9..12010d9c74af4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1110,12 +1110,8 @@ class TargetLoweringBase { /// Return how the indexed load should be treated: either it is legal, needs /// to be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. - LegalizeAction - getIndexedLoadAction(unsigned IdxMode, MVT VT) const { - assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && - "Table isn't big enough!"); - unsigned Ty = (unsigned)VT.SimpleTy; - return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] & 0xf0) >> 4); + LegalizeAction getIndexedLoadAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_Load); } /// Return true if the specified indexed load is legal on this target. @@ -1128,12 +1124,8 @@ class TargetLoweringBase { /// Return how the indexed store should be treated: either it is legal, needs /// to be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. - LegalizeAction - getIndexedStoreAction(unsigned IdxMode, MVT VT) const { - assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && - "Table isn't big enough!"); - unsigned Ty = (unsigned)VT.SimpleTy; - return (LegalizeAction)(IndexedModeActions[Ty][IdxMode] & 0x0f); + LegalizeAction getIndexedStoreAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_Store); } /// Return true if the specified indexed store is legal on this target. @@ -1143,6 +1135,34 @@ getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); } + /// Return how the indexed masked load should be treated: either it is legal, + /// needs to be promoted to a larger size, needs to be expanded to some other + /// code sequence, or the target has a custom expander for it. + LegalizeAction getIndexedMaskedLoadAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_MaskedLoad); + } + + /// Return true if the specified indexed masked load is legal on this target. + bool isIndexedMaskedLoadLegal(unsigned IdxMode, EVT VT) const { + return VT.isSimple() && + (getIndexedMaskedLoadAction(IdxMode, VT.getSimpleVT()) == Legal || + getIndexedMaskedLoadAction(IdxMode, VT.getSimpleVT()) == Custom); + } + + /// Return how the indexed masked store should be treated: either it is legal, + /// needs to be promoted to a larger size, needs to be expanded to some other + /// code sequence, or the target has a custom expander for it. + LegalizeAction getIndexedMaskedStoreAction(unsigned IdxMode, MVT VT) const { + return getIndexedModeAction(IdxMode, VT, IMAB_MaskedStore); + } + + /// Return true if the specified indexed masked store is legal on this target.
+ bool isIndexedMaskedStoreLegal(unsigned IdxMode, EVT VT) const { + return VT.isSimple() && + (getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Legal || + getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); + } + /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom /// expander for it. @@ -2030,13 +2050,8 @@ class TargetLoweringBase { /// /// NOTE: All indexed mode loads are initialized to Expand in /// TargetLowering.cpp - void setIndexedLoadAction(unsigned IdxMode, MVT VT, - LegalizeAction Action) { - assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && - (unsigned)Action < 0xf && "Table isn't big enough!"); - // Load action are kept in the upper half. - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0xf0; - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action) <<4; + void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_Load, Action); } /// Indicate that the specified indexed store does or does not work with the @@ -2044,13 +2059,28 @@ class TargetLoweringBase { /// /// NOTE: All indexed mode stores are initialized to Expand in /// TargetLowering.cpp - void setIndexedStoreAction(unsigned IdxMode, MVT VT, - LegalizeAction Action) { - assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && - (unsigned)Action < 0xf && "Table isn't big enough!"); - // Store action are kept in the lower half. - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0x0f; - IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action); + void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_Store, Action); + } + + /// Indicate that the specified indexed masked load does or does not work with + /// the specified type and indicate what to do about it. + /// + /// NOTE: All indexed mode masked loads are initialized to Expand in + /// TargetLowering.cpp + void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, + LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_MaskedLoad, Action); + } + + /// Indicate that the specified indexed masked store does or does not work + /// with the specified type and indicate what to do about it. + /// + /// NOTE: All indexed mode masked stores are initialized to Expand in + /// TargetLowering.cpp + void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, + LegalizeAction Action) { + setIndexedModeAction(IdxMode, VT, IMAB_MaskedStore, Action); } /// Indicate that the specified condition code is or isn't supported on the @@ -2763,13 +2793,13 @@ class TargetLoweringBase { /// truncating store of a specific value type and truncating type is legal. LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; - /// For each indexed mode and each value type, keep a pair of LegalizeAction + /// For each indexed mode and each value type, keep a quad of LegalizeAction /// that indicates how instruction selection should deal with the load / - /// store. + /// store / maskedload / maskedstore. /// /// The first dimension is the value_type for the reference. The second /// dimension represents the various modes for load store. 
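To make the quad packing concrete before the table declaration just below: each 16-bit entry holds four independent 4-bit LegalizeAction nibbles at the IMAB_* shifts defined further down. A self-contained sketch of the shift/mask arithmetic (the action values here are illustrative stand-ins, not the real LegalizeAction enumerators):

```cpp
#include <cassert>
#include <cstdint>

// Stand-ins for LegalizeAction values; they only need to fit in 4 bits.
enum Action : uint16_t { Legal = 0, Promote = 1, Expand = 2, Custom = 4 };

int main() {
  uint16_t Entry = 0; // one IndexedModeActions[VT][IdxMode] slot
  auto Set = [&Entry](unsigned Shift, uint16_t A) {
    Entry &= ~(0xf << Shift); // clear the old nibble
    Entry |= A << Shift;      // install the new action
  };
  Set(/*IMAB_Store*/ 0, Expand);
  Set(/*IMAB_MaskedLoad*/ 12, Custom);
  // Reads mirror getIndexedModeAction: shift down, mask one nibble.
  assert(((Entry >> 0) & 0xf) == Expand);
  assert(((Entry >> 12) & 0xf) == Custom);
  assert(((Entry >> 4) & 0xf) == Legal); // untouched lanes remain 0
  return 0;
}
```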
- uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE]; + uint16_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE]; /// For each condition code (ISD::CondCode) keep a LegalizeAction that /// indicates how instruction selection should deal with the condition code. @@ -2812,6 +2842,32 @@ class TargetLoweringBase { /// Set default libcall names and calling conventions. void InitLibcalls(const Triple &TT); + /// The bits of IndexedModeActions used to store the legalisation actions + /// We store the data as | ML | MS | L | S | each taking 4 bits. + enum IndexedModeActionsBits { + IMAB_Store = 0, + IMAB_Load = 4, + IMAB_MaskedStore = 8, + IMAB_MaskedLoad = 12 + }; + + void setIndexedModeAction(unsigned IdxMode, MVT VT, unsigned Shift, + LegalizeAction Action) { + assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && + (unsigned)Action < 0xf && "Table isn't big enough!"); + unsigned Ty = (unsigned)VT.SimpleTy; + IndexedModeActions[Ty][IdxMode] &= ~(0xf << Shift); + IndexedModeActions[Ty][IdxMode] |= ((uint16_t)Action) << Shift; + } + + LegalizeAction getIndexedModeAction(unsigned IdxMode, MVT VT, + unsigned Shift) const { + assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && + "Table isn't big enough!"); + unsigned Ty = (unsigned)VT.SimpleTy; + return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] >> Shift) & 0xf); + } + protected: /// Return true if the extension represented by \p I is free. /// \pre \p I is a sign, zero, or fp extension and @@ -3247,9 +3303,7 @@ class TargetLowering : public TargetLoweringBase { bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; } bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; } - bool isAfterLegalizeDAG() const { - return Level == AfterLegalizeDAG; - } + bool isAfterLegalizeDAG() const { return Level >= AfterLegalizeDAG; } CombineLevel getDAGCombineLevel() { return Level; } bool isCalledByLegalizer() const { return CalledByLegalizer; } diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h index 2dec107d1458d..f0896b1fc5ae5 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -75,6 +75,7 @@ class DWARFContext : public DIContext { DWARFUnitVector DWOUnits; std::unique_ptr AbbrevDWO; + std::unique_ptr MacroDWO; /// The maximum DWARF version of all units. unsigned MaxVersion = 0; @@ -271,6 +272,9 @@ class DWARFContext : public DIContext { /// Get a pointer to the parsed DebugMacro object. const DWARFDebugMacro *getDebugMacro(); + /// Get a pointer to the parsed DebugMacroDWO object. + const DWARFDebugMacro *getDebugMacroDWO(); + /// Get a reference to the parsed accelerator table object. const DWARFDebugNames &getDebugNames(); diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h index 358cacb65afd0..3b141304f85f4 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h @@ -34,6 +34,9 @@ struct DWARFLocationEntry { /// The second value of the location entry (if applicable). uint64_t Value1; + /// The index of the section this entry is relative to (if applicable). + uint64_t SectionIndex; + /// The location expression itself (if applicable). SmallVector Loc; }; @@ -60,8 +63,9 @@ class DWARFLocationTable { /// updated to point past the end of the current list). 
bool dumpLocationList(uint64_t *Offset, raw_ostream &OS, Optional BaseAddr, - const MCRegisterInfo *MRI, DWARFUnit *U, - DIDumpOptions DumpOpts, unsigned Indent) const; + const MCRegisterInfo *MRI, const DWARFObject &Obj, + DWARFUnit *U, DIDumpOptions DumpOpts, + unsigned Indent) const; Error visitAbsoluteLocationList( uint64_t Offset, Optional BaseAddr, @@ -72,7 +76,8 @@ class DWARFLocationTable { DWARFDataExtractor Data; virtual void dumpRawEntry(const DWARFLocationEntry &Entry, raw_ostream &OS, - unsigned Indent) const = 0; + unsigned Indent, DIDumpOptions DumpOpts, + const DWARFObject &Obj) const = 0; }; class DWARFDebugLoc final : public DWARFLocationTable { @@ -98,7 +103,8 @@ class DWARFDebugLoc final : public DWARFLocationTable { : DWARFLocationTable(std::move(Data)) {} /// Print the location lists found within the debug_loc section. - void dump(raw_ostream &OS, const MCRegisterInfo *RegInfo, DIDumpOptions DumpOpts, + void dump(raw_ostream &OS, const MCRegisterInfo *RegInfo, + const DWARFObject &Obj, DIDumpOptions DumpOpts, Optional Offset) const; Error visitLocationList( @@ -107,7 +113,8 @@ class DWARFDebugLoc final : public DWARFLocationTable { protected: void dumpRawEntry(const DWARFLocationEntry &Entry, raw_ostream &OS, - unsigned Indent) const override; + unsigned Indent, DIDumpOptions DumpOpts, + const DWARFObject &Obj) const override; }; class DWARFDebugLoclists final : public DWARFLocationTable { @@ -121,11 +128,13 @@ class DWARFDebugLoclists final : public DWARFLocationTable { /// Dump all location lists within the given range. void dumpRange(uint64_t StartOffset, uint64_t Size, raw_ostream &OS, - const MCRegisterInfo *MRI, DIDumpOptions DumpOpts); + const MCRegisterInfo *MRI, const DWARFObject &Obj, + DIDumpOptions DumpOpts); protected: void dumpRawEntry(const DWARFLocationEntry &Entry, raw_ostream &OS, - unsigned Indent) const override; + unsigned Indent, DIDumpOptions DumpOpts, + const DWARFObject &Obj) const override; private: uint16_t Version; diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h index 952c41e188c7d..88e5432851d67 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFDebugRnglists.h @@ -45,6 +45,12 @@ struct RangeListEntry : public DWARFListEntryBase { /// A class representing a single rangelist. class DWARFDebugRnglist : public DWARFListType { public: + /// Build a DWARFAddressRangesVector from a rangelist. + DWARFAddressRangesVector + getAbsoluteRanges(Optional BaseAddr, + function_ref(uint32_t)> + LookupPooledAddress) const; + /// Build a DWARFAddressRangesVector from a rangelist. 
DWARFAddressRangesVector getAbsoluteRanges(llvm::Optional<object::SectionedAddress> BaseAddr, diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h index 9cd34a588c564..fbcde7d7cd788 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFObject.h @@ -48,6 +48,7 @@ class DWARFObject { virtual const DWARFSection &getRangesSection() const { return Dummy; } virtual const DWARFSection &getRnglistsSection() const { return Dummy; } virtual StringRef getMacinfoSection() const { return ""; } + virtual StringRef getMacinfoDWOSection() const { return ""; } virtual const DWARFSection &getPubnamesSection() const { return Dummy; } virtual const DWARFSection &getPubtypesSection() const { return Dummy; } virtual const DWARFSection &getGnuPubnamesSection() const { return Dummy; } diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h index 98d7a7ee3cae1..36fdd511d1e25 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -427,14 +427,18 @@ class DWARFUnit { /// an entry in the rangelist table's offset array and is supplied by /// DW_FORM_rnglistx. Optional<uint64_t> getRnglistOffset(uint32_t Index) { - if (RngListTable) - return RngListTable->getOffsetEntry(Index); + if (!RngListTable) + return None; + if (Optional<uint64_t> Off = RngListTable->getOffsetEntry(Index)) + return *Off + RangeSectionBase; return None; } Optional<uint64_t> getLoclistOffset(uint32_t Index) { - if (LoclistTableHeader) - return LoclistTableHeader->getOffsetEntry(Index); + if (!LoclistTableHeader) + return None; + if (Optional<uint64_t> Off = LoclistTableHeader->getOffsetEntry(Index)) + return *Off + getLocSectionBase(); return None; } Expected<DWARFAddressRangesVector> collectAddressRanges(); diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h index 11599fc1797d8..8bfa5432b8112 100644 --- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h +++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h @@ -16,6 +16,7 @@ #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h" #include "llvm/Object/Binary.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/ELFObjectFile.h" #include "llvm/Support/Error.h" #include #include @@ -44,6 +45,7 @@ class LLVMSymbolizer { std::vector<std::string> DsymHints; std::string FallbackDebugPath; std::string DWPName; + std::vector<std::string> DebugFileDirectory; }; LLVMSymbolizer() = default; @@ -98,6 +100,9 @@ class LLVMSymbolizer { ObjectFile *lookUpDebuglinkObject(const std::string &Path, const ObjectFile *Obj, const std::string &ArchName); + ObjectFile *lookUpBuildIDObject(const std::string &Path, + const ELFObjectFileBase *Obj, + const std::string &ArchName); /// Returns pair of pointers to object and debug object.
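The new DebugFileDirectory option slots into the usual symbolizer setup. A minimal sketch, assuming a host ELF binary; the path, module name, and address below are made up:

```cpp
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::symbolize;

int demo() {
  LLVMSymbolizer::Options Opts;
  // Extra directory to search for debug binaries (made-up path).
  Opts.DebugFileDirectory.push_back("/usr/lib/debug");
  LLVMSymbolizer Symbolizer(Opts);

  object::SectionedAddress Addr;
  Addr.Address = 0x401000; // made-up module offset
  if (auto LineInfo = Symbolizer.symbolizeCode("./a.out", Addr))
    outs() << LineInfo->FileName << ":" << LineInfo->Line << "\n";
  else
    consumeError(LineInfo.takeError());
  return 0;
}
```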
Expected<ObjectPair> getOrCreateObjectPair(const std::string &Path, diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h index aebd55563e615..7470cca498068 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h @@ -324,14 +324,14 @@ class Symbol { } static Symbol &constructExternal(void *SymStorage, Addressable &Base, - StringRef Name, JITTargetAddress Size) { + StringRef Name, JITTargetAddress Size, + Linkage L) { assert(SymStorage && "Storage cannot be null"); assert(!Base.isDefined() && "Cannot create external symbol from defined block"); assert(!Name.empty() && "External symbol name cannot be empty"); auto *Sym = reinterpret_cast<Symbol *>(SymStorage); - new (Sym) Symbol(Base, 0, Name, Size, Linkage::Strong, Scope::Default, - false, false); + new (Sym) Symbol(Base, 0, Name, Size, L, Scope::Default, false, false); return *Sym; } @@ -477,7 +477,7 @@ class Symbol { /// Set the linkage for this Symbol. void setLinkage(Linkage L) { - assert((L == Linkage::Strong || (Base->isDefined() && !Name.empty())) && + assert((L == Linkage::Strong || (!Base->isAbsolute() && !Name.empty())) && "Linkage can only be applied to defined named symbols"); this->L = static_cast<uint8_t>(L); } @@ -849,9 +849,14 @@ class LinkGraph { /// Add an external symbol. /// Some formats (e.g. ELF) allow Symbols to have sizes. For Symbols whose /// size is not known, you should substitute '0'. - Symbol &addExternalSymbol(StringRef Name, uint64_t Size) { - auto &Sym = Symbol::constructExternal( - Allocator.Allocate<Symbol>(), createAddressable(0, false), Name, Size); + /// For external symbols Linkage determines whether the symbol must be + /// present during lookup: Externals with strong linkage must be found or + /// an error will be emitted. Externals with weak linkage are permitted to + /// be undefined, in which case they are assigned a value of 0. + Symbol &addExternalSymbol(StringRef Name, uint64_t Size, Linkage L) { + auto &Sym = + Symbol::constructExternal(Allocator.Allocate<Symbol>(), + createAddressable(0, false), Name, Size, L); ExternalSymbols.insert(&Sym); return Sym; } @@ -1189,6 +1194,14 @@ struct PassConfiguration { LinkGraphPassList PostFixupPasses; }; +/// Flags for symbol lookup. +/// +/// FIXME: These basically duplicate orc::SymbolLookupFlags -- We should merge +/// the two types once we have an OrcSupport library. +enum class SymbolLookupFlags { RequiredSymbol, WeaklyReferencedSymbol }; + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LF); + /// A map of symbol names to resolved addresses. using AsyncLookupResult = DenseMap<StringRef, JITEvaluatedSymbol>; @@ -1223,6 +1236,8 @@ createLookupContinuation(Continuation Cont) { /// Holds context for a single jitLink invocation. class JITLinkContext { public: + using LookupMap = DenseMap<StringRef, SymbolLookupFlags>; + /// Destroy a JITLinkContext. virtual ~JITLinkContext(); @@ -1240,7 +1255,7 @@ class JITLinkContext { /// Called by JITLink to resolve external symbols. This method is passed a /// lookup continuation which it must call with a result to continue the linking process.
- virtual void lookup(const DenseSet<StringRef> &Symbols, + virtual void lookup(const LookupMap &Symbols, std::unique_ptr<JITLinkAsyncLookupContinuation> LC) = 0; /// Called by JITLink once all defined symbols in the graph have been assigned diff --git a/llvm/include/llvm/ExecutionEngine/JITSymbol.h b/llvm/include/llvm/ExecutionEngine/JITSymbol.h index c0f1ca4b98760..7a2a6cfa52037 100644 --- a/llvm/include/llvm/ExecutionEngine/JITSymbol.h +++ b/llvm/include/llvm/ExecutionEngine/JITSymbol.h @@ -41,6 +41,11 @@ class SymbolRef; using JITTargetAddress = uint64_t; /// Convert a JITTargetAddress to a pointer. +/// +/// Note: This is a raw cast of the address bit pattern to the given pointer +/// type. When casting to a function pointer in order to execute JIT'd code +/// jitTargetAddressToFunction should be preferred, as it will also perform +/// pointer signing on targets that require it. template <typename T> T jitTargetAddressToPointer(JITTargetAddress Addr) { static_assert(std::is_pointer<T>::value, "T must be a pointer type"); uintptr_t IntPtr = static_cast<uintptr_t>(Addr); @@ -48,6 +53,19 @@ template <typename T> T jitTargetAddressToPointer(JITTargetAddress Addr) { return reinterpret_cast<T>(IntPtr); } +/// Convert a JITTargetAddress to a callable function pointer. +/// +/// Casts the given address to a callable function pointer. This operation +/// will perform pointer signing for platforms that require it (e.g. arm64e). +template <typename T> T jitTargetAddressToFunction(JITTargetAddress Addr) { + static_assert( + std::is_pointer<T>::value && + std::is_function<typename std::remove_pointer<T>::type>::value, + "T must be a function pointer type"); + return jitTargetAddressToPointer<T>(Addr); +} + +/// Convert a pointer to a JITTargetAddress. template <typename T> JITTargetAddress pointerToJITTargetAddress(T *Ptr) { return static_cast<JITTargetAddress>(reinterpret_cast<uintptr_t>(Ptr)); } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 4f22a4c387966..2f52edb8de257 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -45,8 +45,11 @@ using VModuleKey = uint64_t; // efficiency). using SymbolNameSet = DenseSet<SymbolStringPtr>; +/// A vector of symbol names. +using SymbolNameVector = std::vector<SymbolStringPtr>; + /// A map from symbol names (as SymbolStringPtrs) to JITSymbols -/// (address/flags pairs). +/// (address/flags pairs). using SymbolMap = DenseMap<SymbolStringPtr, JITEvaluatedSymbol>; /// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags. @@ -55,8 +58,244 @@ using SymbolFlagsMap = DenseMap<SymbolStringPtr, JITSymbolFlags>; /// A map from JITDylibs to sets of symbols. using SymbolDependenceMap = DenseMap<JITDylib *, SymbolNameSet>; -/// A list of (JITDylib*, bool) pairs. -using JITDylibSearchList = std::vector<std::pair<JITDylib *, bool>>; +/// Lookup flags that apply to each dylib in the search order for a lookup. +/// +/// If MatchExportedSymbolsOnly is used (the default) for a given dylib, then +/// only symbols in that dylib's interface will be searched. If +/// MatchAllSymbols is used then symbols with hidden visibility will match +/// as well. +enum class JITDylibLookupFlags { MatchExportedSymbolsOnly, MatchAllSymbols }; + +/// Lookup flags that apply to each symbol in a lookup. +/// +/// If RequiredSymbol is used (the default) for a given symbol then that symbol +/// must be found during the lookup or the lookup will fail returning a +/// SymbolNotFound error. If WeaklyReferencedSymbol is used and the given +/// symbol is not found then the query will continue, and no result for the +/// missing symbol will be present in the result (assuming the rest of the +/// lookup succeeds).
+enum class SymbolLookupFlags { RequiredSymbol, WeaklyReferencedSymbol }; + +/// Describes the kind of lookup being performed. The lookup kind is passed to +/// symbol generators (if they're invoked) to help them determine what +/// definitions to generate. +/// +/// Static -- Lookup is being performed as-if at static link time (e.g. +/// generators representing static archives should pull in new +/// definitions). +/// +/// DLSym -- Lookup is being performed as-if at runtime (e.g. generators +/// representing static archives should not pull in new definitions). +enum class LookupKind { Static, DLSym }; + +/// A list of (JITDylib*, JITDylibLookupFlags) pairs to be used as a search +/// order during symbol lookup. +using JITDylibSearchOrder = + std::vector<std::pair<JITDylib *, JITDylibLookupFlags>>; + +/// Convenience function for creating a search order from an ArrayRef of +/// JITDylib*, all with the same flags. +inline JITDylibSearchOrder makeJITDylibSearchOrder( + ArrayRef<JITDylib *> JDs, + JITDylibLookupFlags Flags = JITDylibLookupFlags::MatchExportedSymbolsOnly) { + JITDylibSearchOrder O; + O.reserve(JDs.size()); + for (auto *JD : JDs) + O.push_back(std::make_pair(JD, Flags)); + return O; +} + +/// A set of symbols to look up, each associated with a SymbolLookupFlags +/// value. +/// +/// This class is backed by a vector and optimized for fast insertion, +/// deletion and iteration. It does not guarantee a stable order between +/// operations, and will not automatically detect duplicate elements (they +/// can be manually checked by calling the containsDuplicates method). +class SymbolLookupSet { +public: + using value_type = std::pair<SymbolStringPtr, SymbolLookupFlags>; + using UnderlyingVector = std::vector<value_type>; + using iterator = UnderlyingVector::iterator; + using const_iterator = UnderlyingVector::const_iterator; + + SymbolLookupSet() = default; + + explicit SymbolLookupSet( + SymbolStringPtr Name, + SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { + add(std::move(Name), Flags); + } + + /// Construct a SymbolLookupSet from an initializer list of SymbolStringPtrs. + explicit SymbolLookupSet( + std::initializer_list<SymbolStringPtr> Names, + SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { + Symbols.reserve(Names.size()); + for (auto &Name : Names) + add(std::move(Name), Flags); + } + + /// Construct a SymbolLookupSet from a SymbolNameSet with the given + /// Flags used for each value. + explicit SymbolLookupSet( + const SymbolNameSet &Names, + SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { + Symbols.reserve(Names.size()); + for (const auto &Name : Names) + add(Name, Flags); + } + + /// Construct a SymbolLookupSet from a vector of symbols with the given Flags + /// used for each value. + /// If the ArrayRef contains duplicates it is up to the client to remove these + /// before using this instance for lookup. + explicit SymbolLookupSet( + ArrayRef<SymbolStringPtr> Names, + SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { + Symbols.reserve(Names.size()); + for (const auto &Name : Names) + add(Name, Flags); + } + + /// Add an element to the set. The client is responsible for checking that + /// duplicates are not added.
+ void add(SymbolStringPtr Name, + SymbolLookupFlags Flags = SymbolLookupFlags::RequiredSymbol) { + Symbols.push_back(std::make_pair(std::move(Name), Flags)); + } + + bool empty() const { return Symbols.empty(); } + UnderlyingVector::size_type size() const { return Symbols.size(); } + iterator begin() { return Symbols.begin(); } + iterator end() { return Symbols.end(); } + const_iterator begin() const { return Symbols.begin(); } + const_iterator end() const { return Symbols.end(); } + + /// Removes the Ith element of the vector, replacing it with the last element. + void remove(UnderlyingVector::size_type I) { + std::swap(Symbols[I], Symbols.back()); + Symbols.pop_back(); + } + + /// Removes the element pointed to by the given iterator. This iterator and + /// all subsequent ones (including end()) are invalidated. + void remove(iterator I) { remove(I - begin()); } + + /// Removes all elements matching the given predicate, which must be callable + /// as bool(const SymbolStringPtr &, SymbolLookupFlags Flags). + template <typename PredFn> void remove_if(PredFn &&Pred) { + UnderlyingVector::size_type I = 0; + while (I != Symbols.size()) { + const auto &Name = Symbols[I].first; + auto Flags = Symbols[I].second; + if (Pred(Name, Flags)) + remove(I); + else + ++I; + } + } + + /// Loop over the elements of this SymbolLookupSet, applying the Body function + /// to each one. Body must be callable as + /// bool(const SymbolStringPtr &, SymbolLookupFlags). + /// If Body returns true then the element just passed in is removed from the + /// set. If Body returns false then the element is retained. + template <typename BodyFn> + auto forEachWithRemoval(BodyFn &&Body) -> typename std::enable_if< + std::is_same<decltype(std::declval<BodyFn>()( + std::declval<const SymbolStringPtr &>(), + std::declval<SymbolLookupFlags>())), + bool>::value>::type { + UnderlyingVector::size_type I = 0; + while (I != Symbols.size()) { + const auto &Name = Symbols[I].first; + auto Flags = Symbols[I].second; + if (Body(Name, Flags)) + remove(I); + else + ++I; + } + } + + /// Loop over the elements of this SymbolLookupSet, applying the Body function + /// to each one. Body must be callable as + /// Expected<bool>(const SymbolStringPtr &, SymbolLookupFlags). + /// If Body returns a failure value, the loop exits immediately. If Body + /// returns true then the element just passed in is removed from the set. If + /// Body returns false then the element is retained. + template <typename BodyFn> + auto forEachWithRemoval(BodyFn &&Body) -> typename std::enable_if< + std::is_same<decltype(std::declval<BodyFn>()( + std::declval<const SymbolStringPtr &>(), + std::declval<SymbolLookupFlags>())), + Expected<bool>>::value, + Error>::type { + UnderlyingVector::size_type I = 0; + while (I != Symbols.size()) { + const auto &Name = Symbols[I].first; + auto Flags = Symbols[I].second; + auto Remove = Body(Name, Flags); + if (!Remove) + return Remove.takeError(); + if (*Remove) + remove(I); + else + ++I; + } + return Error::success(); + } + + /// Construct a SymbolNameVector from this instance by dropping the Flags + /// values. + SymbolNameVector getSymbolNames() const { + SymbolNameVector Names; + Names.reserve(Symbols.size()); + for (auto &KV : Symbols) + Names.push_back(KV.first); + return Names; + } + + /// Sort the lookup set by pointer value. This sort is fast but sensitive to + /// allocation order and so should not be used where a consistent order is + /// required. + void sortByAddress() { + llvm::sort(Symbols, [](const value_type &LHS, const value_type &RHS) { + return LHS.first < RHS.first; + }); + } + + /// Sort the lookup set lexicographically. This sort is slow but the order + /// is unaffected by allocation order.
+ void sortByName() { + llvm::sort(Symbols, [](const value_type &LHS, const value_type &RHS) { + return *LHS.first < *RHS.first; + }); + } + + /// Remove any duplicate elements. If a SymbolLookupSet is not duplicate-free + /// by construction, this method can be used to turn it into a proper set. + void removeDuplicates() { + sortByAddress(); + auto LastI = std::unique(Symbols.begin(), Symbols.end()); + Symbols.erase(LastI, Symbols.end()); + } + +#ifndef NDEBUG + /// Returns true if this set contains any duplicates. This should only be used + /// in assertions. + bool containsDuplicates() { + if (Symbols.size() < 2) + return false; + sortByAddress(); + for (UnderlyingVector::size_type I = 1; I != Symbols.size(); ++I) + if (Symbols[I].first == Symbols[I - 1].first) + return true; + return false; + } +#endif + +private: + UnderlyingVector Symbols; +}; struct SymbolAliasMapEntry { SymbolAliasMapEntry() = default; @@ -76,6 +315,9 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym); /// Render a SymbolNameSet. raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols); +/// Render a SymbolNameVector. +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols); + /// Render a SymbolFlagsMap entry. raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV); @@ -98,8 +340,25 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps); /// Render a MaterializationUnit. raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU); -/// Render a JITDylibSearchList. -raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs); +/// Render a JITDylibLookupFlags instance. +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibLookupFlags &JDLookupFlags); + +/// Render a SymbolLookupFlags instance. +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags); + +/// Render a LookupKind instance. +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); + +/// Render a SymbolLookupSet entry. +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet::value_type &KV); + +/// Render a SymbolLookupSet. +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet); + +/// Render a JITDylibSearchOrder. +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibSearchOrder &SearchOrder); /// Render a SymbolAliasMap. raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); @@ -107,6 +366,9 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); /// Render a SymbolState. raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S); +/// Render a LookupKind. +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); + /// Callback to notify client that symbols have been resolved. using SymbolsResolvedCallback = unique_function<void(Expected<SymbolMap>)>; @@ -139,12 +401,13 @@ class SymbolsNotFound : public ErrorInfo<SymbolsNotFound> { static char ID; SymbolsNotFound(SymbolNameSet Symbols); + SymbolsNotFound(SymbolNameVector Symbols); std::error_code convertToErrorCode() const override; void log(raw_ostream &OS) const override; - const SymbolNameSet &getSymbols() const { return Symbols; } + const SymbolNameVector &getSymbols() const { return Symbols; } private: - SymbolNameSet Symbols; + SymbolNameVector Symbols; }; /// Used to notify clients that a set of symbols could not be removed.
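Taken together with the JITDylibSearchOrder and flag types above, a typical blocking lookup now reads as follows. A minimal sketch, assuming an existing ExecutionSession and JITDylib; the symbol names are made up:

```cpp
#include "llvm/ExecutionEngine/Orc/Core.h"
#include <cassert>
using namespace llvm;
using namespace llvm::orc;

Expected<SymbolMap> demoLookup(ExecutionSession &ES, JITDylib &JD) {
  SymbolLookupSet Symbols;
  Symbols.add(ES.intern("main")); // RequiredSymbol by default
  Symbols.add(ES.intern("__optional_hook"),
              SymbolLookupFlags::WeaklyReferencedSymbol);
  assert(!Symbols.containsDuplicates() && "lookup set must be duplicate-free");
  // Search JD's exported interface only; a missing __optional_hook simply
  // drops out of the result instead of failing the query.
  return ES.lookup(makeJITDylibSearchOrder({&JD}), Symbols);
}
```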
@@ -376,7 +639,8 @@ class ReExportsMaterializationUnit : public MaterializationUnit { /// Note: Care must be taken that no sets of aliases form a cycle, as such /// a cycle will result in a deadlock when any symbol in the cycle is /// resolved. - ReExportsMaterializationUnit(JITDylib *SourceJD, bool MatchNonExported, + ReExportsMaterializationUnit(JITDylib *SourceJD, + JITDylibLookupFlags SourceJDLookupFlags, SymbolAliasMap Aliases, VModuleKey K); StringRef getName() const override; @@ -387,7 +651,7 @@ class ReExportsMaterializationUnit : public MaterializationUnit { static SymbolFlagsMap extractFlags(const SymbolAliasMap &Aliases); JITDylib *SourceJD = nullptr; - bool MatchNonExported = false; + JITDylibLookupFlags SourceJDLookupFlags; SymbolAliasMap Aliases; }; @@ -405,25 +669,26 @@ class ReExportsMaterializationUnit : public MaterializationUnit { inline std::unique_ptr<ReExportsMaterializationUnit> symbolAliases(SymbolAliasMap Aliases, VModuleKey K = VModuleKey()) { return std::make_unique<ReExportsMaterializationUnit>( - nullptr, true, std::move(Aliases), std::move(K)); + nullptr, JITDylibLookupFlags::MatchAllSymbols, std::move(Aliases), + std::move(K)); } /// Create a materialization unit for re-exporting symbols from another JITDylib /// with alternative names/flags. -/// If MatchNonExported is true then non-exported symbols from SourceJD can be -/// re-exported. If it is false, attempts to re-export a non-exported symbol -/// will result in a "symbol not found" error. +/// SourceJD will be searched using the given JITDylibLookupFlags. inline std::unique_ptr<ReExportsMaterializationUnit> reexports(JITDylib &SourceJD, SymbolAliasMap Aliases, - bool MatchNonExported = false, VModuleKey K = VModuleKey()) { + JITDylibLookupFlags SourceJDLookupFlags = + JITDylibLookupFlags::MatchExportedSymbolsOnly, + VModuleKey K = VModuleKey()) { return std::make_unique<ReExportsMaterializationUnit>( - &SourceJD, MatchNonExported, std::move(Aliases), std::move(K)); + &SourceJD, SourceJDLookupFlags, std::move(Aliases), std::move(K)); } /// Build a SymbolAliasMap for the common case where you want to re-export /// symbols from another JITDylib with the same linkage/flags. Expected<SymbolAliasMap> -buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols); +buildSimpleReexportsAAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols); /// Represents the state that a symbol has reached during materialization. enum class SymbolState : uint8_t { @@ -448,7 +713,7 @@ class AsynchronousSymbolQuery { /// Create a query for the given symbols. The NotifyComplete /// callback will be called once all queried symbols reach the given /// minimum state. - AsynchronousSymbolQuery(const SymbolNameSet &Symbols, + AsynchronousSymbolQuery(const SymbolLookupSet &Symbols, SymbolState RequiredState, SymbolsResolvedCallback NotifyComplete); @@ -456,6 +721,15 @@ class AsynchronousSymbolQuery { void notifySymbolMetRequiredState(const SymbolStringPtr &Name, JITEvaluatedSymbol Sym); + /// Remove a symbol from the query. This is used to drop weakly referenced + /// symbols that are not found. + void dropSymbol(const SymbolStringPtr &Name) { + assert(ResolvedSymbols.count(Name) && + "Redundant removal of weakly-referenced symbol"); + ResolvedSymbols.erase(Name); + --OutstandingSymbolsCount; + } + + /// Returns true if all symbols covered by this query have been /// resolved.
bool isComplete() const { return OutstandingSymbolsCount == 0; } @@ -497,11 +771,21 @@ class JITDylib { friend class ExecutionSession; friend class MaterializationResponsibility; public: + /// Definition generators can be attached to JITDylibs to generate new + /// definitions for otherwise unresolved symbols during lookup. class DefinitionGenerator { public: virtual ~DefinitionGenerator(); - virtual Expected - tryToGenerate(JITDylib &Parent, const SymbolNameSet &Names) = 0; + + /// DefinitionGenerators should override this method to insert new + /// definitions into the parent JITDylib. K specifies the kind of this + /// lookup. JD specifies the target JITDylib being searched, and + /// JDLookupFlags specifies whether the search should match against + /// hidden symbols. Finally, Symbols describes the set of unresolved + /// symbols and their associated lookup flags. + virtual Error tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &LookupSet) = 0; }; using AsynchronousSymbolQuerySet = @@ -552,18 +836,20 @@ class JITDylib { /// as the first in the search order (instead of this dylib) ensures that /// definitions within this dylib resolve to the lazy-compiling stubs, /// rather than immediately materializing the definitions in this dylib. - void setSearchOrder(JITDylibSearchList NewSearchOrder, - bool SearchThisJITDylibFirst = true, - bool MatchNonExportedInThisDylib = true); + void setSearchOrder(JITDylibSearchOrder NewSearchOrder, + bool SearchThisJITDylibFirst = true); /// Add the given JITDylib to the search order for definitions in this /// JITDylib. - void addToSearchOrder(JITDylib &JD, bool MatcNonExported = false); + void addToSearchOrder(JITDylib &JD, + JITDylibLookupFlags JDLookupFlags = + JITDylibLookupFlags::MatchExportedSymbolsOnly); /// Replace OldJD with NewJD in the search order if OldJD is present. /// Otherwise this operation is a no-op. void replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD, - bool MatchNonExported = false); + JITDylibLookupFlags JDLookupFlags = + JITDylibLookupFlags::MatchExportedSymbolsOnly); /// Remove the given JITDylib from the search order for this JITDylib if it is /// present. Otherwise this operation is a no-op. @@ -572,7 +858,7 @@ class JITDylib { /// Do something with the search order (run under the session lock). template auto withSearchOrderDo(Func &&F) - -> decltype(F(std::declval())); + -> decltype(F(std::declval())); /// Define all symbols provided by the materialization unit to be part of this /// JITDylib. @@ -605,8 +891,11 @@ class JITDylib { Error remove(const SymbolNameSet &Names); /// Search the given JITDylib for the symbols in Symbols. If found, store - /// the flags for each symbol in Flags. Returns any unresolved symbols. - Expected lookupFlags(const SymbolNameSet &Names); + /// the flags for each symbol in Flags. If any required symbols are not found + /// then an error will be returned. + Expected lookupFlags(LookupKind K, + JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet LookupSet); /// Dump current JITDylib state to OS. 
void dump(raw_ostream &OS); @@ -709,20 +998,23 @@ class JITDylib { Error defineImpl(MaterializationUnit &MU); - Expected lookupFlagsImpl(SymbolFlagsMap &Flags, - const SymbolNameSet &Names); + void lookupFlagsImpl(SymbolFlagsMap &Result, LookupKind K, + JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &Unresolved); - Error lodgeQuery(std::shared_ptr &Q, - SymbolNameSet &Unresolved, bool MatchNonExported, - MaterializationUnitList &MUs); + Error lodgeQuery(MaterializationUnitList &MUs, + std::shared_ptr &Q, LookupKind K, + JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &Unresolved); - Error lodgeQueryImpl(std::shared_ptr &Q, - SymbolNameSet &Unresolved, bool MatchNonExported, - MaterializationUnitList &MUs); + Error lodgeQueryImpl(MaterializationUnitList &MUs, + std::shared_ptr &Q, + LookupKind K, JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &Unresolved); bool lookupImpl(std::shared_ptr &Q, std::vector> &MUs, - SymbolNameSet &Unresolved); + SymbolLookupSet &Unresolved); void detachQueryHelper(AsynchronousSymbolQuery &Q, const SymbolNameSet &QuerySymbols); @@ -754,7 +1046,7 @@ class JITDylib { UnmaterializedInfosMap UnmaterializedInfos; MaterializingInfosMap MaterializingInfos; std::vector> DefGenerators; - JITDylibSearchList SearchOrder; + JITDylibSearchOrder SearchOrder; }; /// An ExecutionSession represents a running JIT program. @@ -787,10 +1079,6 @@ class ExecutionSession { return F(); } - /// Get the "main" JITDylib, which is created automatically on construction of - /// the ExecutionSession. - JITDylib &getMainJITDylib(); - /// Return a pointer to the "name" JITDylib. /// Ownership of JITDylib remains within Execution Session JITDylib *getJITDylibByName(StringRef Name); @@ -800,8 +1088,7 @@ class ExecutionSession { /// The JITDylib Name is required to be unique. Clients should verify that /// names are not being re-used (e.g. by calling getJITDylibByName) if names /// are based on user input. - JITDylib &createJITDylib(std::string Name, - bool AddToMainDylibSearchOrder = true); + JITDylib &createJITDylib(std::string Name); /// Allocate a module key for a new module to add to the JIT. VModuleKey allocateVModule() { @@ -863,8 +1150,9 @@ class ExecutionSession { /// dependenant symbols for this query (e.g. it is being made by a top level /// client to get an address to call) then the value NoDependenciesToRegister /// can be used. - void lookup(const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols, - SymbolState RequiredState, SymbolsResolvedCallback NotifyComplete, + void lookup(LookupKind K, const JITDylibSearchOrder &SearchOrder, + SymbolLookupSet Symbols, SymbolState RequiredState, + SymbolsResolvedCallback NotifyComplete, RegisterDependenciesFunction RegisterDependencies); /// Blocking version of lookup above. Returns the resolved symbol map. @@ -874,8 +1162,9 @@ class ExecutionSession { /// or an error occurs. If WaitUntilReady is false and an error occurs /// after resolution, the function will return a success value, but the /// error will be reported via reportErrors. - Expected lookup(const JITDylibSearchList &SearchOrder, - const SymbolNameSet &Symbols, + Expected lookup(const JITDylibSearchOrder &SearchOrder, + const SymbolLookupSet &Symbols, + LookupKind K = LookupKind::Static, SymbolState RequiredState = SymbolState::Ready, RegisterDependenciesFunction RegisterDependencies = NoDependenciesToRegister); @@ -883,7 +1172,7 @@ class ExecutionSession { /// Convenience version of blocking lookup. 
/// Searches each of the JITDylibs in the search order in turn for the given /// symbol. - Expected lookup(const JITDylibSearchList &SearchOrder, + Expected lookup(const JITDylibSearchOrder &SearchOrder, SymbolStringPtr Symbol); /// Convenience version of blocking lookup. @@ -951,7 +1240,7 @@ GeneratorT &JITDylib::addGenerator(std::unique_ptr DefGenerator) { template auto JITDylib::withSearchOrderDo(Func &&F) - -> decltype(F(std::declval())) { + -> decltype(F(std::declval())) { return ES.runSessionLocked([&]() { return F(SearchOrder); }); } @@ -997,15 +1286,17 @@ class ReexportsGenerator : public JITDylib::DefinitionGenerator { /// Create a reexports generator. If an Allow predicate is passed, only /// symbols for which the predicate returns true will be reexported. If no /// Allow predicate is passed, all symbols will be exported. - ReexportsGenerator(JITDylib &SourceJD, bool MatchNonExported = false, + ReexportsGenerator(JITDylib &SourceJD, + JITDylibLookupFlags SourceJDLookupFlags, SymbolPredicate Allow = SymbolPredicate()); - Expected tryToGenerate(JITDylib &JD, - const SymbolNameSet &Names) override; + Error tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &LookupSet) override; private: JITDylib &SourceJD; - bool MatchNonExported = false; + JITDylibLookupFlags SourceJDLookupFlags; SymbolPredicate Allow; }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h index b9bbace6f6308..c797dbbbdfd97 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h @@ -40,6 +40,17 @@ namespace orc { class ObjectLayer; +/// Run a main function, returning the result. +/// +/// If the optional ProgramName argument is given then it will be inserted +/// before the strings in Args as the first argument to the called function. +/// +/// It is legal to have an empty argument list and no program name, however +/// many main functions will expect a name argument at least, and will fail +/// if none is provided. +int runAsMain(int (*Main)(int, char *[]), ArrayRef Args, + Optional ProgramName = None); + /// This iterator provides a convenient way to iterate over the elements /// of an llvm.global_ctors/llvm.global_dtors instance. 
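A short usage sketch for `runAsMain`, where `MainAddr` stands in for an address obtained from an earlier ExecutionSession lookup and the argument strings are illustrative only:

```cpp
// jitTargetAddressToPointer is the JITSymbol.h helper for this conversion.
using MainTy = int (*)(int, char *[]);
auto *MainFn = jitTargetAddressToPointer<MainTy>(MainAddr);

// Roughly equivalent to running: jitted-prog --input in.txt
int Result = runAsMain(MainFn, {"--input", "in.txt"},
                       StringRef("jitted-prog"));
```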
/// @@ -268,8 +279,9 @@ class DynamicLibrarySearchGenerator : public JITDylib::DefinitionGenerator { return Load(nullptr, GlobalPrefix, std::move(Allow)); } - Expected tryToGenerate(JITDylib &JD, - const SymbolNameSet &Names) override; + Error tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) override; private: sys::DynamicLibrary Dylib; @@ -297,8 +309,9 @@ class StaticLibraryDefinitionGenerator : public JITDylib::DefinitionGenerator { static Expected> Create(ObjectLayer &L, std::unique_ptr ArchiveBuffer); - Expected tryToGenerate(JITDylib &JD, - const SymbolNameSet &Names) override; + Error tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) override; private: StaticLibraryDefinitionGenerator(ObjectLayer &L, @@ -307,8 +320,7 @@ class StaticLibraryDefinitionGenerator : public JITDylib::DefinitionGenerator { ObjectLayer &L; std::unique_ptr ArchiveBuffer; - object::Archive Archive; - size_t UnrealizedObjects = 0; + std::unique_ptr Archive; }; } // end namespace orc diff --git a/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h index bcbd72e68f154..c8c4ecdaff160 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h @@ -91,6 +91,12 @@ class JITTargetMachineBuilder { return *this; } + /// Set subtarget features. + JITTargetMachineBuilder &setFeatures(StringRef FeatureString) { + Features = SubtargetFeatures(FeatureString); + return *this; + } + /// Add subtarget features. JITTargetMachineBuilder & addFeatures(const std::vector &FeatureVec); @@ -101,6 +107,17 @@ class JITTargetMachineBuilder { /// Access subtarget features. const SubtargetFeatures &getFeatures() const { return Features; } + /// Set TargetOptions. + /// + /// Note: This operation will overwrite any previously configured options, + /// including EmulatedTLS and ExplicitEmulatedTLS which + /// the JITTargetMachineBuilder sets by default. Clients are responsible + /// for re-enabling these overwritten options. + JITTargetMachineBuilder &setOptions(TargetOptions Options) { + this->Options = std::move(Options); + return *this; + } + /// Access TargetOptions. TargetOptions &getOptions() { return Options; } diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index 766a6b070f12f..f6b86bb231678 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -100,23 +100,27 @@ class Speculator { SymbolsInJD.insert(ImplSymbolName); } - DEBUG_WITH_TYPE("orc", for (auto &I - : SpeculativeLookUpImpls) { - llvm::dbgs() << "\n In " << I.first->getName() << " JITDylib "; - for (auto &N : I.second) - llvm::dbgs() << "\n Likely Symbol : " << N; + DEBUG_WITH_TYPE("orc", { + for (auto &I : SpeculativeLookUpImpls) { + llvm::dbgs() << "\n In " << I.first->getName() << " JITDylib "; + for (auto &N : I.second) + llvm::dbgs() << "\n Likely Symbol : " << N; + } }); // for a given symbol, there may be no symbol qualified for speculatively // compile try to fix this before jumping to this code if possible. 
for (auto &LookupPair : SpeculativeLookUpImpls) - ES.lookup(JITDylibSearchList({{LookupPair.first, true}}), - LookupPair.second, SymbolState::Ready, - [this](Expected Result) { - if (auto Err = Result.takeError()) - ES.reportError(std::move(Err)); - }, - NoDependenciesToRegister); + ES.lookup( + LookupKind::Static, + makeJITDylibSearchOrder(LookupPair.first, + JITDylibLookupFlags::MatchAllSymbols), + SymbolLookupSet(LookupPair.second), SymbolState::Ready, + [this](Expected Result) { + if (auto Err = Result.takeError()) + ES.reportError(std::move(Err)); + }, + NoDependenciesToRegister); } public: @@ -151,8 +155,11 @@ class Speculator { this->getES().reportError(ReadySymbol.takeError()); }; // Include non-exported symbols also. - ES.lookup(JITDylibSearchList({{JD, true}}), SymbolNameSet({Target}), - SymbolState::Ready, OnReadyFixUp, NoDependenciesToRegister); + ES.lookup( + LookupKind::Static, + makeJITDylibSearchOrder(JD, JITDylibLookupFlags::MatchAllSymbols), + SymbolLookupSet(Target, SymbolLookupFlags::WeaklyReferencedSymbol), + SymbolState::Ready, OnReadyFixUp, NoDependenciesToRegister); } } diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index ad9a35b554144..7ea0c95612403 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -237,8 +237,10 @@ namespace llvm { /// \param File File where this type is defined. /// \param LineNo Line number. /// \param Context The surrounding context for the typedef. + /// \param AlignInBits Alignment. (optional) DIDerivedType *createTypedef(DIType *Ty, StringRef Name, DIFile *File, - unsigned LineNo, DIScope *Context); + unsigned LineNo, DIScope *Context, + uint32_t AlignInBits = 0); /// Create debugging information entry for a 'friend'. DIDerivedType *createFriend(DIType *Ty, DIType *FriendTy); diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 0e43a05b318eb..d690ccece5fa6 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -3253,6 +3253,89 @@ class DIMacroFile : public DIMacroNode { } }; +/// Identifies a unique instance of a variable. +/// +/// Storage for identifying a potentially inlined instance of a variable, +/// or a fragment thereof. This guarantees that exactly one variable instance +/// may be identified by this class, even when that variable is a fragment of +/// an aggregate variable and/or there is another inlined instance of the same +/// source code variable nearby. +/// This class does not necessarily uniquely identify that variable: it is +/// possible that a DebugVariable with different parameters may point to the +/// same variable instance, but not that one DebugVariable points to multiple +/// variable instances. +class DebugVariable { + using FragmentInfo = DIExpression::FragmentInfo; + + const DILocalVariable *Variable; + Optional Fragment; + const DILocation *InlinedAt; + + /// Fragment that will overlap all other fragments. Used as default when + /// caller demands a fragment. + static const FragmentInfo DefaultFragment; + +public: + DebugVariable(const DILocalVariable *Var, Optional FragmentInfo, + const DILocation *InlinedAt) + : Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {} + + DebugVariable(const DILocalVariable *Var, const DIExpression *DIExpr, + const DILocation *InlinedAt) + : Variable(Var), + Fragment(DIExpr ? 
DIExpr->getFragmentInfo() : NoneType()), + InlinedAt(InlinedAt) {} + + const DILocalVariable *getVariable() const { return Variable; } + const Optional getFragment() const { return Fragment; } + const DILocation *getInlinedAt() const { return InlinedAt; } + + const FragmentInfo getFragmentOrDefault() const { + return Fragment.getValueOr(DefaultFragment); + } + + static bool isDefaultFragment(const FragmentInfo F) { + return F == DefaultFragment; + } + + bool operator==(const DebugVariable &Other) const { + return std::tie(Variable, Fragment, InlinedAt) == + std::tie(Other.Variable, Other.Fragment, Other.InlinedAt); + } + + bool operator<(const DebugVariable &Other) const { + return std::tie(Variable, Fragment, InlinedAt) < + std::tie(Other.Variable, Other.Fragment, Other.InlinedAt); + } +}; + +template <> struct DenseMapInfo { + using FragmentInfo = DIExpression::FragmentInfo; + + /// Empty key: no key should be generated that has no DILocalVariable. + static inline DebugVariable getEmptyKey() { + return DebugVariable(nullptr, NoneType(), nullptr); + } + + /// Difference in tombstone is that the Optional is meaningful. + static inline DebugVariable getTombstoneKey() { + return DebugVariable(nullptr, {{0, 0}}, nullptr); + } + + static unsigned getHashValue(const DebugVariable &D) { + unsigned HV = 0; + const Optional Fragment = D.getFragment(); + if (Fragment) + HV = DenseMapInfo::getHashValue(*Fragment); + + return hash_combine(D.getVariable(), HV, D.getInlinedAt()); + } + + static bool isEqual(const DebugVariable &A, const DebugVariable &B) { + return A == B; + } +}; + } // end namespace llvm #undef DEFINE_MDNODE_GET_UNPACK_IMPL diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 2d9c72108d3d4..24d39c2bc526f 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -265,7 +265,6 @@ class IRBuilderBase { void setConstrainedFPCallAttr(CallInst *I) { if (!I->hasFnAttr(Attribute::StrictFP)) I->addAttribute(AttributeList::FunctionIndex, Attribute::StrictFP); - setConstrainedFPFunctionAttr(); } //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 836911128ec46..1edce65c9ce67 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -446,6 +446,10 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlal2 : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl2 : AdvSIMD_FP16FML_Intrinsic; + + // v8.3-A Floating-point complex add + def int_aarch64_neon_vcadd_rot90 : AdvSIMD_2VectorArg_Intrinsic; + def int_aarch64_neon_vcadd_rot270 : AdvSIMD_2VectorArg_Intrinsic; } let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". @@ -786,6 +790,21 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". 
llvm_i32_ty], [IntrNoMem]>; + class AdvSIMD_Pred2VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>], + [IntrNoMem]>; + + class AdvSIMD_Pred3VectorArg_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_SVE_Compare_Intrinsic : Intrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -813,6 +832,20 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". llvm_anyvector_ty], [IntrNoMem]>; + class AdvSIMD_SVE_ShiftByImm_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + llvm_i32_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_ShiftWide_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMMatchType<0>, + llvm_nxv2i64_ty], + [IntrNoMem]>; + class AdvSIMD_SVE_Unpack_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMSubdivide2VectorType<0>], @@ -849,6 +882,26 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". [LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class AdvSIMD_SVE_FCVT_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_FCVTZS_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMVectorOfBitcastsToInt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_INSR_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMVectorElementType<0>], + [IntrNoMem]>; + class AdvSIMD_SVE_PUNPKHI_Intrinsic : Intrinsic<[LLVMHalfElementsVectorType<0>], [llvm_anyvector_ty], @@ -861,12 +914,29 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class AdvSIMD_SVE_SCVTF_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty], + [IntrNoMem]>; + class AdvSIMD_SVE_TSMUL_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + class AdvSIMD_SVE_CNTB_Intrinsic + : Intrinsic<[llvm_i64_ty], + [llvm_i32_ty], + [IntrNoMem]>; + + class AdvSIMD_SVE_CNTP_Intrinsic + : Intrinsic<[llvm_i64_ty], + [llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; + class AdvSIMD_SVE_DOT_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, @@ -882,6 +952,42 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". llvm_i32_ty], [IntrNoMem]>; +class AdvSIMD_GatherLoad_64bitOffset_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [ + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i64_ty> + ], + [IntrReadMem, IntrArgMemOnly]>; + + class SVE2_3VectorArg_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMSubdivide2VectorType<0>, + LLVMSubdivide2VectorType<0>], + [IntrNoMem]>; + + class SVE2_3VectorArgIndexed_Long_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + LLVMSubdivide2VectorType<0>, + LLVMSubdivide2VectorType<0>, + llvm_i32_ty], + [IntrNoMem]>; + + // NOTE: There is no relationship between these intrinsics beyond an attempt + // to reuse currently identical class definitions. 
+  class AdvSIMD_SVE_LOGB_Intrinsic : AdvSIMD_SVE_CNT_Intrinsic;
+
+class AdvSIMD_GatherLoad_32bitOffset_Intrinsic
+    : Intrinsic<[ llvm_anyvector_ty ],
+                [
+                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                  LLVMPointerToElt<0>, llvm_anyvector_ty
+                ],
+                [ IntrReadMem, IntrArgMemOnly ]>;
+
  // This class of intrinsics is not intended to be useful within LLVM IR but
  // is instead here to support some of the more rigid parts of the ACLE.
  class Builtin_SVCVT
@@ -893,18 +999,19 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
 // SVE

 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
-
-
-class AdvSIMD_Pred2VectorArg_Intrinsic
+  class AdvSIMD_SVE_WHILE_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
-                [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMMatchType<0>],
-                [IntrNoMem]>;
+                [llvm_anyint_ty, LLVMMatchType<1>],
+                [IntrNoMem]>;

-class AdvSIMD_Pred3VectorArg_Intrinsic
+class AdvSIMD_GatherLoad_VecTorBase_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
-                [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
-                [IntrNoMem]>;
-
+                [
+                  LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                  llvm_anyvector_ty,
+                  llvm_i64_ty
+                ],
+                [IntrReadMem, IntrArgMemOnly]>;
 //
 // Integer arithmetic
@@ -914,12 +1021,6 @@ def int_aarch64_sve_add : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_sub : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_subr : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_and : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_or : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_xor : AdvSIMD_Pred2VectorArg_Intrinsic;
-def int_aarch64_sve_bic : AdvSIMD_2VectorArg_Intrinsic;
-def int_aarch64_sve_bic_pred : AdvSIMD_Pred2VectorArg_Intrinsic;
-
 def int_aarch64_sve_mul : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_smulh : AdvSIMD_Pred2VectorArg_Intrinsic;
 def int_aarch64_sve_umulh : AdvSIMD_Pred2VectorArg_Intrinsic;
@@ -950,6 +1051,17 @@ def int_aarch64_sve_sdot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
 def int_aarch64_sve_udot : AdvSIMD_SVE_DOT_Intrinsic;
 def int_aarch64_sve_udot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
+// Shifts
+
+def int_aarch64_sve_asr : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_asr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic;
+def int_aarch64_sve_asrd : AdvSIMD_SVE_ShiftByImm_Intrinsic;
+def int_aarch64_sve_insr : AdvSIMD_SVE_INSR_Intrinsic;
+def int_aarch64_sve_lsl : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_lsl_wide : AdvSIMD_SVE_ShiftWide_Intrinsic;
+def int_aarch64_sve_lsr : AdvSIMD_Pred2VectorArg_Intrinsic;
+def int_aarch64_sve_lsr_wide : AdvSIMD_SVE_ShiftWide_Intrinsic;
+
 //
 // Counting bits
 //
@@ -958,6 +1070,26 @@ def int_aarch64_sve_cls : AdvSIMD_Merged1VectorArg_Intrinsic;
 def int_aarch64_sve_clz : AdvSIMD_Merged1VectorArg_Intrinsic;
 def int_aarch64_sve_cnt : AdvSIMD_SVE_CNT_Intrinsic;
+//
+// Counting elements
+//
+
+def int_aarch64_sve_cntb : AdvSIMD_SVE_CNTB_Intrinsic;
+def int_aarch64_sve_cnth : AdvSIMD_SVE_CNTB_Intrinsic;
+def int_aarch64_sve_cntw : AdvSIMD_SVE_CNTB_Intrinsic;
+def int_aarch64_sve_cntd : AdvSIMD_SVE_CNTB_Intrinsic;
+
+def int_aarch64_sve_cntp : AdvSIMD_SVE_CNTP_Intrinsic;
+
+//
+// Reversal
+//
+
+def int_aarch64_sve_rbit : AdvSIMD_Merged1VectorArg_Intrinsic;
+def int_aarch64_sve_revb : AdvSIMD_Merged1VectorArg_Intrinsic;
+def int_aarch64_sve_revh : AdvSIMD_Merged1VectorArg_Intrinsic;
+def int_aarch64_sve_revw : AdvSIMD_Merged1VectorArg_Intrinsic;
+
 //
 // Permutations
and selection // @@ -975,6 +1107,25 @@ def int_aarch64_sve_uunpklo : AdvSIMD_SVE_Unpack_Intrinsic; def int_aarch64_sve_cnot : AdvSIMD_Merged1VectorArg_Intrinsic; def int_aarch64_sve_not : AdvSIMD_Merged1VectorArg_Intrinsic; +def int_aarch64_sve_and : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_or : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_xor : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_bic_base : AdvSIMD_2VectorArg_Intrinsic; +def int_aarch64_sve_bic : AdvSIMD_Pred2VectorArg_Intrinsic; + +def int_aarch64_sve_eor : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_ands : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_bics : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_eors : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_orr : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_orn : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_nor : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_nand : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_orrs : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_orns : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_nors : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_nands : AdvSIMD_Pred2VectorArg_Intrinsic; + // // Conversion // @@ -986,6 +1137,19 @@ def int_aarch64_sve_uxtb : AdvSIMD_Merged1VectorArg_Intrinsic; def int_aarch64_sve_uxth : AdvSIMD_Merged1VectorArg_Intrinsic; def int_aarch64_sve_uxtw : AdvSIMD_Merged1VectorArg_Intrinsic; +// +// While comparisons +// + +def int_aarch64_sve_whilele : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilelo : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilels : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilelt : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilege : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilegt : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilehs : AdvSIMD_SVE_WHILE_Intrinsic; +def int_aarch64_sve_whilehi : AdvSIMD_SVE_WHILE_Intrinsic; + // // Floating-point arithmetic // @@ -1048,6 +1212,16 @@ def int_aarch64_sve_fmaxnmv : AdvSIMD_SVE_Reduce_Intrinsic; def int_aarch64_sve_fminv : AdvSIMD_SVE_Reduce_Intrinsic; def int_aarch64_sve_fminnmv : AdvSIMD_SVE_Reduce_Intrinsic; +// +// Floating-point conversions +// + +def int_aarch64_sve_fcvt : AdvSIMD_SVE_FCVT_Intrinsic; +def int_aarch64_sve_fcvtzs : AdvSIMD_SVE_FCVTZS_Intrinsic; +def int_aarch64_sve_fcvtzu : AdvSIMD_SVE_FCVTZS_Intrinsic; +def int_aarch64_sve_scvtf : AdvSIMD_SVE_SCVTF_Intrinsic; +def int_aarch64_sve_ucvtf : AdvSIMD_SVE_SCVTF_Intrinsic; + // // Floating-point comparisons // @@ -1061,7 +1235,41 @@ def int_aarch64_sve_fcmpgt : AdvSIMD_SVE_Compare_Intrinsic; def int_aarch64_sve_fcmpne : AdvSIMD_SVE_Compare_Intrinsic; def int_aarch64_sve_fcmpuo : AdvSIMD_SVE_Compare_Intrinsic; -def int_aarch64_sve_fcvtzs_i32f16 : Builtin_SVCVT<"svcvt_s32_f16_m", llvm_nxv4i32_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvtzs_i32f16 : Builtin_SVCVT<"svcvt_s32_f16_m", llvm_nxv4i32_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvtzs_i32f64 : Builtin_SVCVT<"svcvt_s32_f64_m", llvm_nxv4i32_ty, llvm_nxv2f64_ty>; +def int_aarch64_sve_fcvtzs_i64f16 : Builtin_SVCVT<"svcvt_s64_f16_m", llvm_nxv2i64_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvtzs_i64f32 : Builtin_SVCVT<"svcvt_s64_f32_m", llvm_nxv2i64_ty, llvm_nxv4f32_ty>; + +def int_aarch64_sve_fcvtzu_i32f16 : Builtin_SVCVT<"svcvt_u32_f16_m", llvm_nxv4i32_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvtzu_i32f64 : Builtin_SVCVT<"svcvt_u32_f64_m", llvm_nxv4i32_ty, 
llvm_nxv2f64_ty>; +def int_aarch64_sve_fcvtzu_i64f16 : Builtin_SVCVT<"svcvt_u64_f16_m", llvm_nxv2i64_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvtzu_i64f32 : Builtin_SVCVT<"svcvt_u64_f32_m", llvm_nxv2i64_ty, llvm_nxv4f32_ty>; + +def int_aarch64_sve_fcvt_f16f32 : Builtin_SVCVT<"svcvt_f16_f32_m", llvm_nxv8f16_ty, llvm_nxv4f32_ty>; +def int_aarch64_sve_fcvt_f16f64 : Builtin_SVCVT<"svcvt_f16_f64_m", llvm_nxv8f16_ty, llvm_nxv2f64_ty>; +def int_aarch64_sve_fcvt_f32f64 : Builtin_SVCVT<"svcvt_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2f64_ty>; + +def int_aarch64_sve_fcvt_f32f16 : Builtin_SVCVT<"svcvt_f32_f16_m", llvm_nxv4f32_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvt_f64f16 : Builtin_SVCVT<"svcvt_f64_f16_m", llvm_nxv2f64_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvt_f64f32 : Builtin_SVCVT<"svcvt_f64_f32_m", llvm_nxv2f64_ty, llvm_nxv4f32_ty>; + +def int_aarch64_sve_fcvtlt_f32f16 : Builtin_SVCVT<"svcvtlt_f32_f16_m", llvm_nxv4f32_ty, llvm_nxv8f16_ty>; +def int_aarch64_sve_fcvtlt_f64f32 : Builtin_SVCVT<"svcvtlt_f64_f32_m", llvm_nxv2f64_ty, llvm_nxv4f32_ty>; +def int_aarch64_sve_fcvtnt_f16f32 : Builtin_SVCVT<"svcvtnt_f16_f32_m", llvm_nxv8f16_ty, llvm_nxv4f32_ty>; +def int_aarch64_sve_fcvtnt_f32f64 : Builtin_SVCVT<"svcvtnt_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2f64_ty>; + +def int_aarch64_sve_fcvtx_f32f64 : Builtin_SVCVT<"svcvtx_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2f64_ty>; +def int_aarch64_sve_fcvtxnt_f32f64 : Builtin_SVCVT<"svcvtxnt_f32_f64_m", llvm_nxv4f32_ty, llvm_nxv2f64_ty>; + +def int_aarch64_sve_scvtf_f16i32 : Builtin_SVCVT<"svcvt_f16_s32_m", llvm_nxv8f16_ty, llvm_nxv4i32_ty>; +def int_aarch64_sve_scvtf_f16i64 : Builtin_SVCVT<"svcvt_f16_s64_m", llvm_nxv8f16_ty, llvm_nxv2i64_ty>; +def int_aarch64_sve_scvtf_f32i64 : Builtin_SVCVT<"svcvt_f32_s64_m", llvm_nxv4f32_ty, llvm_nxv2i64_ty>; +def int_aarch64_sve_scvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_s32_m", llvm_nxv2f64_ty, llvm_nxv4i32_ty>; + +def int_aarch64_sve_ucvtf_f16i32 : Builtin_SVCVT<"svcvt_f16_u32_m", llvm_nxv8f16_ty, llvm_nxv4i32_ty>; +def int_aarch64_sve_ucvtf_f16i64 : Builtin_SVCVT<"svcvt_f16_u64_m", llvm_nxv8f16_ty, llvm_nxv2i64_ty>; +def int_aarch64_sve_ucvtf_f32i64 : Builtin_SVCVT<"svcvt_f32_u64_m", llvm_nxv4f32_ty, llvm_nxv2i64_ty>; +def int_aarch64_sve_ucvtf_f64i32 : Builtin_SVCVT<"svcvt_f64_u32_m", llvm_nxv2f64_ty, llvm_nxv4i32_ty>; // // Predicate operations @@ -1069,4 +1277,56 @@ def int_aarch64_sve_fcvtzs_i32f16 : Builtin_SVCVT<"svcvt_s32_f16_m", llvm_nxv4i3 def int_aarch64_sve_punpkhi : AdvSIMD_SVE_PUNPKHI_Intrinsic; def int_aarch64_sve_punpklo : AdvSIMD_SVE_PUNPKHI_Intrinsic; + +// +// Gather loads: +// + +// scalar + vector, 64 bit unscaled offsets +def int_aarch64_sve_ld1_gather : AdvSIMD_GatherLoad_64bitOffset_Intrinsic; + +// scalar + vector, 64 bit scaled offsets +def int_aarch64_sve_ld1_gather_index : AdvSIMD_GatherLoad_64bitOffset_Intrinsic; + +// scalar + vector, 32 bit unscaled offsets, sign (sxtw) or zero (zxtw) +// extended to 64 bits +def int_aarch64_sve_ld1_gather_sxtw : AdvSIMD_GatherLoad_32bitOffset_Intrinsic; +def int_aarch64_sve_ld1_gather_uxtw : AdvSIMD_GatherLoad_32bitOffset_Intrinsic; + +// scalar + vector, 32 bit scaled offsets, sign (sxtw) or zero (zxtw) extended +// to 64 bits +def int_aarch64_sve_ld1_gather_sxtw_index : AdvSIMD_GatherLoad_32bitOffset_Intrinsic; +def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_32bitOffset_Intrinsic; + +// vector base + immediate index +def int_aarch64_sve_ld1_gather_imm : AdvSIMD_GatherLoad_VecTorBase_Intrinsic; + +// +// SVE2 - Non-widening pairwise 
arithmetic +// + +def int_aarch64_sve_faddp : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_fmaxp : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_fmaxnmp : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_fminp : AdvSIMD_Pred2VectorArg_Intrinsic; +def int_aarch64_sve_fminnmp : AdvSIMD_Pred2VectorArg_Intrinsic; + +// +// SVE2 - Floating-point widening multiply-accumulate +// + +def int_aarch64_sve_fmlalb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_fmlalb_lane : SVE2_3VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_fmlalt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_fmlalt_lane : SVE2_3VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_fmlslb : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_fmlslb_lane : SVE2_3VectorArgIndexed_Long_Intrinsic; +def int_aarch64_sve_fmlslt : SVE2_3VectorArg_Long_Intrinsic; +def int_aarch64_sve_fmlslt_lane : SVE2_3VectorArgIndexed_Long_Intrinsic; + +// +// SVE2 - Floating-point integer binary logarithm +// + +def int_aarch64_sve_flogb : AdvSIMD_SVE_LOGB_Intrinsic; } diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 10417411edca2..c4061ea01eeec 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -426,8 +426,6 @@ let IntrProperties = [IntrNoMem, Commutative] in { def int_arm_neon_vhaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vrhadds : Neon_2Arg_Intrinsic; def int_arm_neon_vrhaddu : Neon_2Arg_Intrinsic; - def int_arm_neon_vqadds : Neon_2Arg_Intrinsic; - def int_arm_neon_vqaddu : Neon_2Arg_Intrinsic; def int_arm_neon_vraddhn : Neon_2Arg_Narrow_Intrinsic; // Vector Multiply. @@ -459,8 +457,6 @@ let IntrProperties = [IntrNoMem, Commutative] in { // Vector Subtract. def int_arm_neon_vhsubs : Neon_2Arg_Intrinsic; def int_arm_neon_vhsubu : Neon_2Arg_Intrinsic; -def int_arm_neon_vqsubs : Neon_2Arg_Intrinsic; -def int_arm_neon_vqsubu : Neon_2Arg_Intrinsic; def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic; // Vector Absolute Compare. 
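As a usage sketch for the SVE intrinsic declarations in the hunks above: from C++ they are reached through the generated `Intrinsic::` enum, e.g. `int_aarch64_sve_cntb` becomes `Intrinsic::aarch64_sve_cntb`. Here `M` and `Builder` are assumed to be an in-scope `Module` and `IRBuilder<>`, and 31 is assumed to be the ACLE "all elements" pattern immediate:

```cpp
// llvm.aarch64.sve.cntb has no overloaded types, so no type list is needed.
Function *CntB = Intrinsic::getDeclaration(&M, Intrinsic::aarch64_sve_cntb);
Value *BytesPerVec = Builder.CreateCall(CntB, {Builder.getInt32(31)});
```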
@@ -777,10 +773,15 @@ class Neon_Dot_Intrinsic def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; -def int_arm_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; +// vctp64 takes v4i1, to work around v2i1 not being a legal MVE type +def int_arm_mve_vctp64 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; + +// v8.3-A Floating-point complex add +def int_arm_neon_vcadd_rot90 : Neon_2Arg_Intrinsic; +def int_arm_neon_vcadd_rot270 : Neon_2Arg_Intrinsic; // GNU eabi mcount def int_arm_gnu_eabi_mcount : Intrinsic<[], @@ -800,12 +801,45 @@ multiclass IntrinsicSignSuffix rets, list params = [], def _u: Intrinsic; } +def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_max_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_abd_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; def int_arm_mve_add_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; +def int_arm_mve_and_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_bic_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_eor_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_orn_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_orr_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; def int_arm_mve_sub_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], [IntrNoMem]>; +def int_arm_mve_mul_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_mulh_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; +def int_arm_mve_rmulh_predicated: Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<0>], + [IntrNoMem]>; defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty], [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; @@ -870,6 +904,9 @@ defm int_arm_mve_sqrshr: ARM_MVE_qrshift<[llvm_i32_ty]>; def int_arm_mve_lsll: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>; def int_arm_mve_asrl: ARM_MVE_qrshift_single<[llvm_i32_ty, llvm_i32_ty]>; +def int_arm_mve_vabd: Intrinsic< + 
[llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_vadc: Intrinsic< [llvm_anyvector_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>; @@ -877,6 +914,12 @@ def int_arm_mve_vadc_predicated: Intrinsic< [llvm_anyvector_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; +def int_arm_mve_vmulh: Intrinsic< + [llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; +def int_arm_mve_vrmulh: Intrinsic< + [llvm_anyvector_ty], + [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>; def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>; diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 5d4ce4955b996..6621fc9f819cd 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -366,7 +366,7 @@ inline api_pred_ty m_Negative(const APInt *&V) { struct is_nonnegative { bool isValue(const APInt &C) { return C.isNonNegative(); } }; -/// Match an integer or vector of nonnegative values. +/// Match an integer or vector of non-negative values. /// For vectors, this includes constants with undefined elements. inline cst_pred_ty m_NonNegative() { return cst_pred_ty(); @@ -375,6 +375,28 @@ inline api_pred_ty m_NonNegative(const APInt *&V) { return V; } +struct is_strictlypositive { + bool isValue(const APInt &C) { return C.isStrictlyPositive(); } +}; +/// Match an integer or vector of strictly positive values. +/// For vectors, this includes constants with undefined elements. +inline cst_pred_ty m_StrictlyPositive() { + return cst_pred_ty(); +} +inline api_pred_ty m_StrictlyPositive(const APInt *&V) { + return V; +} + +struct is_nonpositive { + bool isValue(const APInt &C) { return C.isNonPositive(); } +}; +/// Match an integer or vector of non-positive values. +/// For vectors, this includes constants with undefined elements. +inline cst_pred_ty m_NonPositive() { + return cst_pred_ty(); +} +inline api_pred_ty m_NonPositive(const APInt *&V) { return V; } + struct is_one { bool isValue(const APInt &C) { return C.isOneValue(); } }; @@ -1736,6 +1758,12 @@ struct m_Intrinsic_Ty { Argument_match>; }; +template +struct m_Intrinsic_Ty { + using Ty = match_combine_and::Ty, + Argument_match>; +}; + /// Match intrinsic calls like this: /// m_Intrinsic(m_Value(X)) template inline IntrinsicID_match m_Intrinsic() { @@ -1766,6 +1794,15 @@ m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2, const T3 &Op3) { return m_CombineAnd(m_Intrinsic(Op0, Op1, Op2), m_Argument<3>(Op3)); } +template +inline typename m_Intrinsic_Ty::Ty +m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2, const T3 &Op3, + const T4 &Op4) { + return m_CombineAnd(m_Intrinsic(Op0, Op1, Op2, Op3), + m_Argument<4>(Op4)); +} + // Helper intrinsic matching specializations. 
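The new matchers above slot into the usual `match()` idiom. A small sketch, assuming `V` is a `Value *` under combine and `using namespace llvm::PatternMatch` is in effect:

```cpp
const APInt *C;
if (match(V, m_NonPositive(C))) {
  // V is a constant (or splat, possibly with undef lanes) <= 0; *C is bound.
}
if (match(V, m_StrictlyPositive())) {
  // V is a constant (or splat) > 0.
}
```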
template inline typename m_Intrinsic_Ty::Ty m_BitReverse(const Opnd0 &Op0) { diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index dbae32e843936..574cb69360ac0 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -408,6 +408,7 @@ void initializeTargetTransformInfoWrapperPassPass(PassRegistry&); void initializeThreadSanitizerLegacyPassPass(PassRegistry&); void initializeTwoAddressInstructionPassPass(PassRegistry&); void initializeTypeBasedAAWrapperPassPass(PassRegistry&); +void initializeTypePromotionPass(PassRegistry&); void initializeUnifyFunctionExitNodesPass(PassRegistry&); void initializeUnpackMachineBundlesPass(PassRegistry&); void initializeUnreachableBlockElimLegacyPassPass(PassRegistry&); diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 589f1dfe90b81..5a6dff64caef7 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -333,6 +333,10 @@ class MCAsmInfo { /// protected visibility. Defaults to MCSA_Protected MCSymbolAttr ProtectedVisibilityAttr = MCSA_Protected; + // This attribute is used to indicate symbols such as commons on AIX may have + // a storage mapping class embedded in the name. + bool SymbolsHaveSMC = false; + //===--- Dwarf Emission Directives -----------------------------------===// /// True if target supports emission of debugging information. Defaults to @@ -587,6 +591,8 @@ class MCAsmInfo { return ProtectedVisibilityAttr; } + bool getSymbolsHaveSMC() const { return SymbolsHaveSMC; } + bool doesSupportDebugInformation() const { return SupportsDebugInformation; } bool doesSupportExceptionHandling() const { diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index 12d681ffbebc9..2f7f5d64b466d 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -111,6 +111,7 @@ class MCObjectFileInfo { MCSection *DwarfLineDWOSection = nullptr; MCSection *DwarfLocDWOSection = nullptr; MCSection *DwarfStrOffDWOSection = nullptr; + MCSection *DwarfMacinfoDWOSection = nullptr; /// The DWARF v5 string offset and address table sections. MCSection *DwarfStrOffSection = nullptr; @@ -303,6 +304,9 @@ class MCObjectFileInfo { MCSection *getDwarfLoclistsDWOSection() const { return DwarfLoclistsDWOSection; } + MCSection *getDwarfMacinfoDWOSection() const { + return DwarfMacinfoDWOSection; + } MCSection *getDwarfCUIndexSection() const { return DwarfCUIndexSection; } MCSection *getDwarfTUIndexSection() const { return DwarfTUIndexSection; } MCSection *getDwarfSwiftASTSection() const { return DwarfSwiftASTSection; } diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index c7dc56ea588e9..9280dc75e50bc 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -16,11 +16,13 @@ #define LLVM_MC_MCREGISTERINFO_H #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegister.h" #include #include +#include #include namespace llvm { @@ -177,6 +179,9 @@ class MCRegisterInfo { DenseMap L2CVRegs; // LLVM to CV regs mapping public: + // Forward declaration to become a friend class of DiffListIterator. + template class mc_difflist_iterator; + /// DiffListIterator - Base iterator class that can traverse the /// differentially encoded register and regunit lists in DiffLists. 
/// Don't use this class directly, use one of the specialized sub-classes @@ -220,8 +225,105 @@ class MCRegisterInfo { if (!advance()) List = nullptr; } + + template friend class MCRegisterInfo::mc_difflist_iterator; }; + /// Forward iterator using DiffListIterator. + template + class mc_difflist_iterator + : public iterator_facade_base, + std::forward_iterator_tag, MCPhysReg> { + MCRegisterInfo::DiffListIterator Iter; + /// Current value as MCPhysReg, so we can return a reference to it. + MCPhysReg Val; + + protected: + mc_difflist_iterator(MCRegisterInfo::DiffListIterator Iter) : Iter(Iter) {} + + // Allow conversion between instantiations where valid. + mc_difflist_iterator(MCRegister Reg, const MCPhysReg *DiffList) { + Iter.init(Reg, DiffList); + Val = *Iter; + } + + public: + // Allow default construction to build variables, but this doesn't build + // a useful iterator. + mc_difflist_iterator() = default; + + /// Return an iterator past the last element. + static SubT end() { + SubT End; + End.Iter.List = nullptr; + return End; + } + + bool operator==(const mc_difflist_iterator &Arg) const { + return Iter.List == Arg.Iter.List; + } + + const MCPhysReg &operator*() const { return Val; } + + using mc_difflist_iterator::iterator_facade_base::operator++; + void operator++() { + assert(Iter.List && "Cannot increment the end iterator!"); + ++Iter; + Val = *Iter; + } + }; + + /// Forward iterator over all sub-registers. + /// TODO: Replace remaining uses of MCSubRegIterator. + class mc_subreg_iterator : public mc_difflist_iterator { + public: + mc_subreg_iterator(MCRegisterInfo::DiffListIterator Iter) + : mc_difflist_iterator(Iter) {} + mc_subreg_iterator() = default; + mc_subreg_iterator(MCRegister Reg, const MCRegisterInfo *MCRI) + : mc_difflist_iterator(Reg, MCRI->DiffLists + MCRI->get(Reg).SubRegs) {} + }; + + /// Forward iterator over all super-registers. + /// TODO: Replace remaining uses of MCSuperRegIterator. + class mc_superreg_iterator + : public mc_difflist_iterator { + public: + mc_superreg_iterator(MCRegisterInfo::DiffListIterator Iter) + : mc_difflist_iterator(Iter) {} + mc_superreg_iterator() = default; + mc_superreg_iterator(MCRegister Reg, const MCRegisterInfo *MCRI) + : mc_difflist_iterator(Reg, + MCRI->DiffLists + MCRI->get(Reg).SuperRegs) {} + }; + + /// Return an iterator range over all sub-registers of \p Reg, excluding \p + /// Reg. + iterator_range subregs(MCRegister Reg) const { + return make_range(std::next(mc_subreg_iterator(Reg, this)), + mc_subreg_iterator::end()); + } + + /// Return an iterator range over all sub-registers of \p Reg, including \p + /// Reg. + iterator_range subregs_inclusive(MCRegister Reg) const { + return make_range({Reg, this}, mc_subreg_iterator::end()); + } + + /// Return an iterator range over all super-registers of \p Reg, excluding \p + /// Reg. + iterator_range superregs(MCRegister Reg) const { + return make_range(std::next(mc_superreg_iterator(Reg, this)), + mc_superreg_iterator::end()); + } + + /// Return an iterator range over all super-registers of \p Reg, including \p + /// Reg. + iterator_range + superregs_inclusive(MCRegister Reg) const { + return make_range({Reg, this}, mc_superreg_iterator::end()); + } + // These iterators are allowed to sub-class DiffListIterator and access // internal list pointers. 
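A usage sketch for the new range-based sub/super-register API, assuming `MCRI` is a populated `MCRegisterInfo` and `Reg` a valid register:

```cpp
// Excludes Reg itself; use subregs_inclusive(Reg) to include it.
for (MCPhysReg Sub : MCRI.subregs(Reg))
  dbgs() << "sub-register: " << MCRI.getName(Sub) << "\n";

// Includes Reg itself as the first element.
for (MCPhysReg Super : MCRI.superregs_inclusive(Reg))
  dbgs() << "super-register: " << MCRI.getName(Super) << "\n";
```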
friend class MCSubRegIterator; diff --git a/llvm/include/llvm/MC/MCSymbolXCOFF.h b/llvm/include/llvm/MC/MCSymbolXCOFF.h index 8bc7817404392..07dfb5d299776 100644 --- a/llvm/include/llvm/MC/MCSymbolXCOFF.h +++ b/llvm/include/llvm/MC/MCSymbolXCOFF.h @@ -9,6 +9,7 @@ #define LLVM_MC_MCSYMBOLXCOFF_H #include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/XCOFF.h" #include "llvm/MC/MCSymbol.h" @@ -50,6 +51,17 @@ class MCSymbolXCOFF : public MCSymbol { bool hasContainingCsect() const { return ContainingCsect != nullptr; } + StringRef getUnqualifiedName() const { + const StringRef name = getName(); + if (name.back() == ']') { + StringRef lhs, rhs; + std::tie(lhs, rhs) = name.rsplit('['); + assert(!rhs.empty() && "Invalid SMC format in XCOFF symbol."); + return lhs; + } + return name; + } + private: Optional StorageClass; MCSectionXCOFF *ContainingCsect = nullptr; diff --git a/llvm/include/llvm/Object/ELF.h b/llvm/include/llvm/Object/ELF.h index dc848cee7e574..42c5b67ac3fa8 100644 --- a/llvm/include/llvm/Object/ELF.h +++ b/llvm/include/llvm/Object/ELF.h @@ -402,12 +402,17 @@ ELFFile::getSectionContentsAsArray(const Elf_Shdr *Sec) const { " has an invalid sh_size (" + Twine(Size) + ") which is not a multiple of its sh_entsize (" + Twine(Sec->sh_entsize) + ")"); - if ((std::numeric_limits::max() - Offset < Size) || - Offset + Size > Buf.size()) + if (std::numeric_limits::max() - Offset < Size) return createError("section " + getSecIndexForError(this, Sec) + " has a sh_offset (0x" + Twine::utohexstr(Offset) + ") + sh_size (0x" + Twine::utohexstr(Size) + ") that cannot be represented"); + if (Offset + Size > Buf.size()) + return createError("section " + getSecIndexForError(this, Sec) + + " has a sh_offset (0x" + Twine::utohexstr(Offset) + + ") + sh_size (0x" + Twine::utohexstr(Size) + + ") that is greater than the file size (0x" + + Twine::utohexstr(Buf.size()) + ")"); if (Offset % alignof(T)) // TODO: this error is untested. diff --git a/llvm/include/llvm/ObjectYAML/ELFYAML.h b/llvm/include/llvm/ObjectYAML/ELFYAML.h index a498621a2a13f..7e0244ed08e7d 100644 --- a/llvm/include/llvm/ObjectYAML/ELFYAML.h +++ b/llvm/include/llvm/ObjectYAML/ELFYAML.h @@ -324,7 +324,8 @@ struct VerneedEntry { }; struct VerneedSection : Section { - std::vector VerneedV; + Optional Content; + Optional> VerneedV; llvm::yaml::Hex64 Info; VerneedSection() : Section(ChunkKind::Verneed) {} @@ -397,7 +398,9 @@ struct VerdefEntry { }; struct VerdefSection : Section { - std::vector Entries; + Optional> Entries; + Optional Content; + llvm::yaml::Hex64 Info; VerdefSection() : Section(ChunkKind::Verdef) {} @@ -478,7 +481,7 @@ struct Object { // top-level key, which automatically ensures that invariants like there // being a single SHT_SYMTAB section are upheld. Optional> Symbols; - std::vector DynamicSymbols; + Optional> DynamicSymbols; std::vector
<Section *> getSections() { std::vector<Section *>
Ret; diff --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h index 63784463e1718..6c0bb6c2fc3ad 100644 --- a/llvm/include/llvm/Support/CommandLine.h +++ b/llvm/include/llvm/Support/CommandLine.h @@ -20,6 +20,8 @@ #define LLVM_SUPPORT_COMMANDLINE_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -29,6 +31,7 @@ #include "llvm/ADT/iterator_range.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -1831,7 +1834,7 @@ void PrintHelpMessage(bool Hidden = false, bool Categorized = false); // /// Use this to get a StringMap to all registered named options -/// (e.g. -help). Note \p Map Should be an empty StringMap. +/// (e.g. -help). /// /// \return A reference to the StringMap used by the cl APIs to parse options. /// @@ -1964,10 +1967,16 @@ bool readConfigFile(StringRef CfgFileName, StringSaver &Saver, /// with nullptrs in the Argv vector. /// \param [in] RelativeNames true if names of nested response files must be /// resolved relative to including file. +/// \param [in] FS File system used for all file access when running the tool. +/// \param [in] CurrentDir Path used to resolve relative rsp files. If set to +/// None, process' cwd is used instead. /// \return true if all @files were expanded successfully or there were none. -bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, - SmallVectorImpl &Argv, - bool MarkEOLs = false, bool RelativeNames = false); +bool ExpandResponseFiles( + StringSaver &Saver, TokenizerCallback Tokenizer, + SmallVectorImpl &Argv, bool MarkEOLs = false, + bool RelativeNames = false, + llvm::vfs::FileSystem &FS = *llvm::vfs::getRealFileSystem(), + llvm::Optional CurrentDir = llvm::None); /// Mark all options not part of this category as cl::ReallyHidden. /// diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h index 07fd94e29a1fb..df0b02c1335d4 100644 --- a/llvm/include/llvm/Support/KnownBits.h +++ b/llvm/include/llvm/Support/KnownBits.h @@ -107,6 +107,18 @@ struct KnownBits { Zero.setSignBit(); } + /// Return the minimal value possible given these KnownBits. + APInt getMinValue() const { + // Assume that all bits that aren't known-ones are zeros. + return One; + } + + /// Return the maximal value possible given these KnownBits. + APInt getMaxValue() const { + // Assume that all bits that aren't known-zeros are ones. + return ~Zero; + } + /// Truncate the underlying known Zero and One bits. This is equivalent /// to truncating the value we're tracking. 
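A worked sketch of the new min/max helpers above: for a 4-bit value known to look like `0b??10` (bit 1 known one, bit 0 known zero), the representable range is [2, 14]:

```cpp
KnownBits Known(4);
Known.One.setBit(1);             // bit 1 is known to be one
Known.Zero.setBit(0);            // bit 0 is known to be zero
APInt Min = Known.getMinValue(); // unknown bits as zeros: 0b0010 == 2
APInt Max = Known.getMaxValue(); // unknown bits as ones:  0b1110 == 14
```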
KnownBits trunc(unsigned BitWidth) const { diff --git a/llvm/include/llvm/Support/LowLevelTypeImpl.h b/llvm/include/llvm/Support/LowLevelTypeImpl.h index 0e02b6e7d750a..6ef7c298bc28f 100644 --- a/llvm/include/llvm/Support/LowLevelTypeImpl.h +++ b/llvm/include/llvm/Support/LowLevelTypeImpl.h @@ -137,6 +137,8 @@ class LLT { : LLT::scalar(NewEltSize); } + bool isByteSized() const { return (getSizeInBits() & 7) == 0; } + unsigned getScalarSizeInBits() const { assert(RawData != 0 && "Invalid Type"); if (!IsVector) { diff --git a/llvm/include/llvm/Support/Path.h b/llvm/include/llvm/Support/Path.h index 488f17427fd7f..97955f882d51e 100644 --- a/llvm/include/llvm/Support/Path.h +++ b/llvm/include/llvm/Support/Path.h @@ -152,18 +152,33 @@ void replace_extension(SmallVectorImpl &path, const Twine &extension, /// /// @code /// /foo, /old, /new => /foo +/// /old, /old, /new => /new +/// /old, /old/, /new, false => /old +/// /old, /old/, /new, true => /new /// /old/foo, /old, /new => /new/foo +/// /old/foo, /old/, /new => /new/foo +/// /old/foo, /old/, /new/ => /new/foo +/// /oldfoo, /old, /new => /oldfoo /// /foo, , /new => /new/foo -/// /old/foo, /old, => /foo +/// /foo, , new => new/foo +/// /old/foo, /old, , false => /foo +/// /old/foo, /old, , true => foo /// @endcode /// /// @param Path If \a Path starts with \a OldPrefix modify to instead /// start with \a NewPrefix. -/// @param OldPrefix The path prefix to strip from \a Path. +/// @param OldPrefix The path prefix to strip from \a Path. Any trailing +/// path separator is ignored if strict is true. /// @param NewPrefix The path prefix to replace \a NewPrefix with. -void replace_path_prefix(SmallVectorImpl &Path, +/// @param style The path separator style +/// @param strict If strict is true, a directory separator following +/// \a OldPrefix will also be stripped. Otherwise, directory +/// separators will only be matched and stripped when present +/// in \a OldPrefix. +/// @result true if \a Path begins with OldPrefix +bool replace_path_prefix(SmallVectorImpl &Path, const StringRef &OldPrefix, const StringRef &NewPrefix, - Style style = Style::native); + Style style = Style::native, bool strict = false); /// Append to path. /// diff --git a/llvm/include/llvm/Support/TimeProfiler.h b/llvm/include/llvm/Support/TimeProfiler.h index 8cc430d0bc727..2b51bba0e7f86 100644 --- a/llvm/include/llvm/Support/TimeProfiler.h +++ b/llvm/include/llvm/Support/TimeProfiler.h @@ -19,7 +19,8 @@ extern TimeTraceProfiler *TimeTraceProfilerInstance; /// Initialize the time trace profiler. /// This sets up the global \p TimeTraceProfilerInstance /// variable to be the profiler instance. -void timeTraceProfilerInitialize(unsigned TimeTraceGranularity); +void timeTraceProfilerInitialize(unsigned TimeTraceGranularity, + StringRef ProcName); /// Cleanup the time trace profiler, if it was initialized. 
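Two rows of the `replace_path_prefix` table above, expressed in code (a `SmallString` converts to the `SmallVectorImpl<char>` the API expects):

```cpp
SmallString<64> P("/old/foo");
bool Matched = sys::path::replace_path_prefix(P, "/old", "/new");
// Matched == true, P == "/new/foo"

SmallString<64> Q("/foo");
sys::path::replace_path_prefix(Q, "", "/new");
// An empty OldPrefix always matches: Q == "/new/foo"
```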
void timeTraceProfilerCleanup(); diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index c395e5bcecf17..d1db4eceabb88 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -107,7 +107,7 @@ namespace llvm { public: TargetOptions() : PrintMachineCode(false), UnsafeFPMath(false), NoInfsFPMath(false), - NoNaNsFPMath(false), NoTrappingFPMath(false), + NoNaNsFPMath(false), NoTrappingFPMath(true), NoSignedZerosFPMath(false), HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false), GuaranteedTailCallOpt(false), StackSymbolOrdering(true), diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 441f3d7d118d1..9543086c4da72 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -224,13 +224,13 @@ def SDTIStore : SDTypeProfile<1, 3, [ // indexed store SDTCisSameAs<0, 2>, SDTCisPtrTy<0>, SDTCisPtrTy<3> ]>; -def SDTMaskedStore: SDTypeProfile<0, 3, [ // masked store - SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2> +def SDTMaskedStore: SDTypeProfile<0, 4, [ // masked store + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameNumEltsAs<0, 3> ]>; -def SDTMaskedLoad: SDTypeProfile<1, 3, [ // masked load - SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>, - SDTCisSameNumEltsAs<0, 2> +def SDTMaskedLoad: SDTypeProfile<1, 4, [ // masked load + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisSameAs<0, 4>, + SDTCisSameNumEltsAs<0, 3> ]>; def SDTVecShuffle : SDTypeProfile<1, 2, [ diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a75a047b7fd0d..4f6f823a230b7 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -1820,6 +1820,42 @@ struct DerefState : AbstractState { /// State representing for dereferenceable bytes. IncIntegerState<> DerefBytesState; + /// Map representing for accessed memory offsets and sizes. + /// A key is Offset and a value is size. + /// If there is a load/store instruction something like, + /// p[offset] = v; + /// (offset, sizeof(v)) will be inserted to this map. + /// std::map is used because we want to iterate keys in ascending order. + std::map AccessedBytesMap; + + /// Helper function to calculate dereferenceable bytes from current known + /// bytes and accessed bytes. + /// + /// int f(int *A){ + /// *A = 0; + /// *(A+2) = 2; + /// *(A+1) = 1; + /// *(A+10) = 10; + /// } + /// ``` + /// In that case, AccessedBytesMap is `{0:4, 4:4, 8:4, 40:4}`. + /// AccessedBytesMap is std::map so it is iterated in accending order on + /// key(Offset). So KnownBytes will be updated like this: |Access | KnownBytes + /// |(0, 4)| 0 -> 4 + /// |(4, 4)| 4 -> 8 + /// |(8, 4)| 8 -> 12 + /// |(40, 4) | 12 (break) + void computeKnownDerefBytesFromAccessedMap() { + int64_t KnownBytes = DerefBytesState.getKnown(); + for (auto &Access : AccessedBytesMap) { + if (KnownBytes < Access.first) + break; + KnownBytes = std::max(KnownBytes, Access.first + (int64_t)Access.second); + } + + DerefBytesState.takeKnownMaximum(KnownBytes); + } + /// State representing that whether the value is globaly dereferenceable. BooleanState GlobalState; @@ -1849,6 +1885,9 @@ struct DerefState : AbstractState { /// Update known dereferenceable bytes. 
void takeKnownDerefBytesMaximum(uint64_t Bytes) { DerefBytesState.takeKnownMaximum(Bytes); + + // Known bytes might increase. + computeKnownDerefBytesFromAccessedMap(); } /// Update assumed dereferenceable bytes. @@ -1856,6 +1895,14 @@ struct DerefState : AbstractState { DerefBytesState.takeAssumedMinimum(Bytes); } + /// Add accessed bytes to the map. + void addAccessedBytes(int64_t Offset, uint64_t Size) { + AccessedBytesMap[Offset] = std::max(AccessedBytesMap[Offset], Size); + + // Known bytes might increase. + computeKnownDerefBytesFromAccessedMap(); + } + /// Equality for DerefState. bool operator==(const DerefState &R) { return this->DerefBytesState == R.DerefBytesState && diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h index 88c2ef787ad81..610668adcfa55 100644 --- a/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/llvm/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -175,6 +175,7 @@ class LibCallSimplifier { Value *optimizeMemCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeBCmp(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCmpBCmpCommon(CallInst *CI, IRBuilder<> &B); + Value *optimizeMemCCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeMemPCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeMemCpy(CallInst *CI, IRBuilder<> &B); Value *optimizeMemMove(CallInst *CI, IRBuilder<> &B); diff --git a/llvm/include/llvm/Transforms/Utils/SizeOpts.h b/llvm/include/llvm/Transforms/Utils/SizeOpts.h index 4614007a64581..ba0f86c452637 100644 --- a/llvm/include/llvm/Transforms/Utils/SizeOpts.h +++ b/llvm/include/llvm/Transforms/Utils/SizeOpts.h @@ -21,6 +21,7 @@ using namespace llvm; extern cl::opt EnablePGSO; extern cl::opt PGSOLargeWorkingSetSizeOnly; +extern cl::opt PGSOIRPassOrTestOnly; extern cl::opt PGSOColdCodeOnly; extern cl::opt ForcePGSO; extern cl::opt PgsoCutoffInstrProf; @@ -33,9 +34,15 @@ class BlockFrequencyInfo; class Function; class ProfileSummaryInfo; +enum class PGSOQueryType { + IRPass, // A query call from an IR-level transform pass. + Test, // A query call from a unit test. + Other, // Others. +}; + template bool shouldFuncOptimizeForSizeImpl(const FuncT *F, ProfileSummaryInfo *PSI, - BFIT *BFI) { + BFIT *BFI, PGSOQueryType QueryType) { assert(F); if (!PSI || !BFI || !PSI->hasProfileSummary()) return false; @@ -43,6 +50,11 @@ bool shouldFuncOptimizeForSizeImpl(const FuncT *F, ProfileSummaryInfo *PSI, return true; if (!EnablePGSO) return false; + // Temporarily enable size optimizations only for the IR pass or test query + // sites for gradual commit/rollout. This is to be removed later. + if (PGSOIRPassOrTestOnly && !(QueryType == PGSOQueryType::IRPass || + QueryType == PGSOQueryType::Test)) + return false; if (PGSOColdCodeOnly || (PGSOLargeWorkingSetSizeOnly && !PSI->hasLargeWorkingSetSize())) { // Even if the working set size isn't large, size-optimize cold code. @@ -55,7 +67,7 @@ bool shouldFuncOptimizeForSizeImpl(const FuncT *F, ProfileSummaryInfo *PSI, template bool shouldOptimizeForSizeImpl(const BlockT *BB, ProfileSummaryInfo *PSI, - BFIT *BFI) { + BFIT *BFI, PGSOQueryType QueryType) { assert(BB); if (!PSI || !BFI || !PSI->hasProfileSummary()) return false; @@ -63,6 +75,11 @@ bool shouldOptimizeForSizeImpl(const BlockT *BB, ProfileSummaryInfo *PSI, return true; if (!EnablePGSO) return false; + // Temporarily enable size optimizations only for the IR pass or test query + // sites for gradual commit/rollout. This is to be removed later. 
+ if (PGSOIRPassOrTestOnly && !(QueryType == PGSOQueryType::IRPass || + QueryType == PGSOQueryType::Test)) + return false; if (PGSOColdCodeOnly || (PGSOLargeWorkingSetSizeOnly && !PSI->hasLargeWorkingSetSize())) { // Even if the working set size isn't large, size-optimize cold code. @@ -73,15 +90,17 @@ bool shouldOptimizeForSizeImpl(const BlockT *BB, ProfileSummaryInfo *PSI, BB, PSI, BFI); } -/// Returns true if function \p F is suggested to be size-optimized base on the +/// Returns true if function \p F is suggested to be size-optimized based on the /// profile. bool shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI); + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType = PGSOQueryType::Other); -/// Returns true if basic block \p BB is suggested to be size-optimized base -/// on the profile. +/// Returns true if basic block \p BB is suggested to be size-optimized based on +/// the profile. bool shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI); + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType = PGSOQueryType::Other); } // end namespace llvm diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp index 7bd237b9ad537..ffba65b5ed5ee 100644 --- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp +++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/PostDominators.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" @@ -146,69 +147,83 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1; /// instruction. This is essentially never taken. static const uint32_t IH_NONTAKEN_WEIGHT = 1; -/// Add \p BB to PostDominatedByUnreachable set if applicable. -void -BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) { - const Instruction *TI = BB->getTerminator(); - if (TI->getNumSuccessors() == 0) { - if (isa(TI) || - // If this block is terminated by a call to - // @llvm.experimental.deoptimize then treat it like an unreachable since - // the @llvm.experimental.deoptimize call is expected to practically - // never execute. - BB->getTerminatingDeoptimizeCall()) - PostDominatedByUnreachable.insert(BB); - return; - } +static void UpdatePDTWorklist(const BasicBlock *BB, PostDominatorTree *PDT, + SmallVectorImpl &WorkList, + SmallPtrSetImpl &TargetSet) { + SmallVector Descendants; + SmallPtrSet NewItems; + + PDT->getDescendants(const_cast(BB), Descendants); + for (auto *BB : Descendants) + if (TargetSet.insert(BB).second) + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + if (!TargetSet.count(*PI)) + NewItems.insert(*PI); + WorkList.insert(WorkList.end(), NewItems.begin(), NewItems.end()); +} - // If the terminator is an InvokeInst, check only the normal destination block - // as the unwind edge of InvokeInst is also very unlikely taken. - if (auto *II = dyn_cast(TI)) { - if (PostDominatedByUnreachable.count(II->getNormalDest())) - PostDominatedByUnreachable.insert(BB); - return; +/// Compute a set of basic blocks that are post-dominated by unreachables. 
+void BranchProbabilityInfo::computePostDominatedByUnreachable( + const Function &F, PostDominatorTree *PDT) { + SmallVector WorkList; + for (auto &BB : F) { + const Instruction *TI = BB.getTerminator(); + if (TI->getNumSuccessors() == 0) { + if (isa(TI) || + // If this block is terminated by a call to + // @llvm.experimental.deoptimize then treat it like an unreachable + // since the @llvm.experimental.deoptimize call is expected to + // practically never execute. + BB.getTerminatingDeoptimizeCall()) + UpdatePDTWorklist(&BB, PDT, WorkList, PostDominatedByUnreachable); + } } - for (auto *I : successors(BB)) - // If any of successor is not post dominated then BB is also not. - if (!PostDominatedByUnreachable.count(I)) - return; - - PostDominatedByUnreachable.insert(BB); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (PostDominatedByUnreachable.count(BB)) + continue; + // If the terminator is an InvokeInst, check only the normal destination + // block as the unwind edge of InvokeInst is also very unlikely taken. + if (auto *II = dyn_cast(BB->getTerminator())) { + if (PostDominatedByUnreachable.count(II->getNormalDest())) + UpdatePDTWorklist(BB, PDT, WorkList, PostDominatedByUnreachable); + } + // If all the successors are unreachable, BB is unreachable as well. + else if (!successors(BB).empty() && + llvm::all_of(successors(BB), [this](const BasicBlock *Succ) { + return PostDominatedByUnreachable.count(Succ); + })) + UpdatePDTWorklist(BB, PDT, WorkList, PostDominatedByUnreachable); + } } -/// Add \p BB to PostDominatedByColdCall set if applicable. -void -BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) { - assert(!PostDominatedByColdCall.count(BB)); - const Instruction *TI = BB->getTerminator(); - if (TI->getNumSuccessors() == 0) - return; +/// compute a set of basic blocks that are post-dominated by ColdCalls. +void BranchProbabilityInfo::computePostDominatedByColdCall( + const Function &F, PostDominatorTree *PDT) { + SmallVector WorkList; + for (auto &BB : F) + for (auto &I : BB) + if (const CallInst *CI = dyn_cast(&I)) + if (CI->hasFnAttr(Attribute::Cold)) + UpdatePDTWorklist(&BB, PDT, WorkList, PostDominatedByColdCall); - // If all of successor are post dominated then BB is also done. - if (llvm::all_of(successors(BB), [&](const BasicBlock *SuccBB) { - return PostDominatedByColdCall.count(SuccBB); - })) { - PostDominatedByColdCall.insert(BB); - return; - } + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); - // If the terminator is an InvokeInst, check only the normal destination - // block as the unwind edge of InvokeInst is also very unlikely taken. - if (auto *II = dyn_cast(TI)) - if (PostDominatedByColdCall.count(II->getNormalDest())) { - PostDominatedByColdCall.insert(BB); - return; + // If the terminator is an InvokeInst, check only the normal destination + // block as the unwind edge of InvokeInst is also very unlikely taken. + if (auto *II = dyn_cast(BB->getTerminator())) { + if (PostDominatedByColdCall.count(II->getNormalDest())) + UpdatePDTWorklist(BB, PDT, WorkList, PostDominatedByColdCall); } - - // Otherwise, if the block itself contains a cold function, add it to the - // set of blocks post-dominated by a cold call. - for (auto &I : *BB) - if (const CallInst *CI = dyn_cast(&I)) - if (CI->hasFnAttr(Attribute::Cold)) { - PostDominatedByColdCall.insert(BB); - return; - } + // If all of successor are post dominated then BB is also done. 
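The seed test used by computePostDominatedByUnreachable can be read in isolation as the predicate below; seedsUnreachableSet is a hypothetical name for this sketch, but the two conditions are exactly the ones in the hunk:

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// A block seeds the "post-dominated by unreachable" set when it has no
// successors and either ends in an actual 'unreachable' or ends in a call
// to @llvm.experimental.deoptimize, which is expected to practically never
// execute.
static bool seedsUnreachableSet(const BasicBlock &BB) {
  const Instruction *TI = BB.getTerminator();
  if (TI->getNumSuccessors() != 0)
    return false;
  return isa<UnreachableInst>(TI) || BB.getTerminatingDeoptimizeCall();
}
```

From each seed, UpdatePDTWorklist then marks every post-dominator-tree descendant at once, which is what makes the new formulation cheaper than the old per-block scan.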
+ else if (!successors(BB).empty() && + llvm::all_of(successors(BB), [this](const BasicBlock *Succ) { + return PostDominatedByColdCall.count(Succ); + })) + UpdatePDTWorklist(BB, PDT, WorkList, PostDominatedByColdCall); + } } /// Calculate edge weights for successors lead to unreachable. @@ -983,13 +998,16 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI, LLVM_DEBUG(dbgs() << "\n"); } + std::unique_ptr PDT = + std::make_unique(const_cast(F)); + computePostDominatedByUnreachable(F, PDT.get()); + computePostDominatedByColdCall(F, PDT.get()); + // Walk the basic blocks in post-order so that we can build up state about // the successors of a block iteratively. for (auto BB : post_order(&F.getEntryBlock())) { LLVM_DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n"); - updatePostDominatedByUnreachable(BB); - updatePostDominatedByColdCall(BB); // If there is no at least two successors, no sense to set probability. if (BB->getTerminator()->getNumSuccessors() < 2) continue; diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp index 82ccea06f28b6..90ce13e6f6503 100644 --- a/llvm/lib/Analysis/DDG.cpp +++ b/llvm/lib/Analysis/DDG.cpp @@ -9,7 +9,9 @@ // The implementation for the data dependence graph. //===----------------------------------------------------------------------===// #include "llvm/Analysis/DDG.h" +#include "llvm/ADT/SCCIterator.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -179,19 +181,28 @@ using BasicBlockListType = SmallVector; DataDependenceGraph::DataDependenceGraph(Function &F, DependenceInfo &D) : DependenceGraphInfo(F.getName().str(), D) { + // Put the basic blocks in program order for correct dependence + // directions. BasicBlockListType BBList; - for (auto &BB : F.getBasicBlockList()) - BBList.push_back(&BB); + for (auto &SCC : make_range(scc_begin(&F), scc_end(&F))) + for (BasicBlock * BB : SCC) + BBList.push_back(BB); + std::reverse(BBList.begin(), BBList.end()); DDGBuilder(*this, D, BBList).populate(); } -DataDependenceGraph::DataDependenceGraph(const Loop &L, DependenceInfo &D) +DataDependenceGraph::DataDependenceGraph(Loop &L, LoopInfo &LI, + DependenceInfo &D) : DependenceGraphInfo(Twine(L.getHeader()->getParent()->getName() + "." + L.getHeader()->getName()) .str(), D) { + // Put the basic blocks in program order for correct dependence + // directions. + LoopBlocksDFS DFS(&L); + DFS.perform(&LI); BasicBlockListType BBList; - for (BasicBlock *BB : L.blocks()) + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) BBList.push_back(BB); DDGBuilder(*this, D, BBList).populate(); } @@ -259,7 +270,7 @@ DDGAnalysis::Result DDGAnalysis::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR) { Function *F = L.getHeader()->getParent(); DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - return std::make_unique(L, DI); + return std::make_unique(L, AR.LI, DI); } AnalysisKey DDGAnalysis::Key; diff --git a/llvm/lib/Analysis/DependenceGraphBuilder.cpp b/llvm/lib/Analysis/DependenceGraphBuilder.cpp index 115f5d6e814b8..98bb09d792b23 100644 --- a/llvm/lib/Analysis/DependenceGraphBuilder.cpp +++ b/llvm/lib/Analysis/DependenceGraphBuilder.cpp @@ -353,5 +353,34 @@ void AbstractDependenceGraphBuilder::createMemoryDependencyEdges() { } } +template +void AbstractDependenceGraphBuilder::sortNodesTopologically() { + + // If we don't create pi-blocks, then we may not have a DAG. 
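A condensed sketch of the ordering trick used by the DataDependenceGraph constructor above, assuming the standard scc_iterator API; the helper name is ours:

```cpp
#include "llvm/ADT/SCCIterator.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include <algorithm>
#include <vector>

using namespace llvm;

// scc_iterator emits the SCCs of the CFG in post-order, so flattening them
// and reversing the list yields a reverse post-order over the SCC DAG, an
// order consistent with control flow in which the blocks of each SCC
// (loop) stay adjacent. That is what gives correct dependence directions.
static std::vector<BasicBlock *> blocksInProgramOrder(Function &F) {
  std::vector<BasicBlock *> BBList;
  for (scc_iterator<Function *> I = scc_begin(&F); !I.isAtEnd(); ++I)
    for (BasicBlock *BB : *I)
      BBList.push_back(BB);
  std::reverse(BBList.begin(), BBList.end());
  return BBList;
}
```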
+ if (!shouldCreatePiBlocks()) + return; + + SmallVector NodesInPO; + using NodeKind = typename NodeType::NodeKind; + for (NodeType *N : post_order(&Graph)) { + if (N->getKind() == NodeKind::PiBlock) { + // Put members of the pi-block right after the pi-block itself, for + // convenience. + const NodeListType &PiBlockMembers = getNodesInPiBlock(*N); + NodesInPO.insert(NodesInPO.end(), PiBlockMembers.begin(), + PiBlockMembers.end()); + } + NodesInPO.push_back(N); + } + + size_t OldSize = Graph.Nodes.size(); + Graph.Nodes.clear(); + for (NodeType *N : reverse(NodesInPO)) + Graph.Nodes.push_back(N); + if (Graph.Nodes.size() != OldSize) + assert(false && + "Expected the number of nodes to stay the same after the sort"); +} + template class llvm::AbstractDependenceGraphBuilder; template class llvm::DependenceGraphInfo; diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index ce99226087fa2..3c33aa973cdd6 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -721,6 +721,13 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence( if (I->getParent()->getTerminator() == I) return false; + // Do not try to sink an instruction multiple times (if multiple operands + // are first order recurrences). + // TODO: We can support this case, by sinking the instruction after the + // 'deepest' previous instruction. + if (SinkAfter.find(I) != SinkAfter.end()) + return false; + if (DT->dominates(Previous, I)) // We already are good w/o sinking. return true; diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 1ba03de69890b..55ce940bc3a5e 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -51,7 +51,7 @@ static cl::opt InlineThreshold( cl::desc("Control the amount of inlining to perform (default = 225)")); static cl::opt HintThreshold( - "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore, + "inlinehint-threshold", cl::Hidden, cl::init(325), cl::ZeroOrMore, cl::desc("Threshold for inlining functions with inline hint")); static cl::opt @@ -63,7 +63,7 @@ static cl::opt // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. static cl::opt ColdThreshold( - "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore, + "inlinecold-threshold", cl::Hidden, cl::init(45), cl::ZeroOrMore, cl::desc("Threshold for inlining functions with cold attribute")); static cl::opt @@ -149,6 +149,9 @@ class CallAnalyzer : public InstVisitor { bool HasUninlineableIntrinsic = false; bool InitsVargArgs = false; + /// Attempt to evaluate indirect calls to boost its inline cost. + bool BoostIndirectCalls; + /// Number of bytes allocated statically by the callee. 
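The core of sortNodesTopologically is the classic "reverse a post-order walk" argument. A minimal self-contained model of just that core follows; the pi-block handling in the real code only splices member nodes next to their pi-block before the reverse, so the ordering argument is unchanged:

```cpp
#include <algorithm>
#include <vector>

// Toy DAG node for the sketch; the real code walks graph nodes via
// llvm::post_order.
struct Node {
  std::vector<Node *> Succs;
  bool Visited = false;
};

static void postOrder(Node *N, std::vector<Node *> &Out) {
  N->Visited = true;
  for (Node *S : N->Succs)
    if (!S->Visited)
      postOrder(S, Out);
  Out.push_back(N); // a node is emitted only after all of its successors
}

// Reversing a post-order walk of a DAG yields a topological order.
static std::vector<Node *> topologicalOrder(Node *Entry) {
  std::vector<Node *> PO;
  postOrder(Entry, PO);
  std::reverse(PO.begin(), PO.end());
  return PO;
}
```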
uint64_t AllocatedSize = 0; unsigned NumInstructions = 0; @@ -295,13 +298,14 @@ class CallAnalyzer : public InstVisitor { std::function &GetAssumptionCache, Optional> &GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, - Function &Callee, CallBase &Call, const InlineParams &Params) + Function &Callee, CallBase &Call, const InlineParams &Params, + bool BoostIndirect = true) : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold), ComputeFullInlineCost(OptComputeFullInlineCost || Params.ComputeFullInlineCost || ORE), - EnableLoadElimination(true) {} + BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {} InlineResult analyzeCall(CallBase &Call); @@ -423,9 +427,9 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) { Operands.push_back(GEP.getOperand(0)); for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I) if (Constant *SimpleOp = SimplifiedValues.lookup(*I)) - Operands.push_back(SimpleOp); - else - Operands.push_back(*I); + Operands.push_back(SimpleOp); + else + Operands.push_back(*I); return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands); } @@ -1239,97 +1243,93 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { if (isa(Call) && cast(Call).cannotDuplicate()) ContainsNoDuplicateCall = true; - if (Function *F = Call.getCalledFunction()) { - // When we have a concrete function, first try to simplify it directly. - if (simplifyCallSite(F, Call)) - return true; - - // Next check if it is an intrinsic we know about. - // FIXME: Lift this into part of the InstVisitor. - if (IntrinsicInst *II = dyn_cast(&Call)) { - switch (II->getIntrinsicID()) { - default: - if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II)) - disableLoadElimination(); - return Base::visitCallBase(Call); - - case Intrinsic::load_relative: - // This is normally lowered to 4 LLVM instructions. - addCost(3 * InlineConstants::InstrCost); - return false; + Value *Callee = Call.getCalledOperand(); + Function *F = dyn_cast_or_null(Callee); + bool IsIndirectCall = !F; + if (IsIndirectCall) { + // Check if this happens to be an indirect function call to a known function + // in this inline context. If not, we've done all we can. + F = dyn_cast_or_null(SimplifiedValues.lookup(Callee)); + if (!F) { + // Pay the price of the argument setup. We account for the average 1 + // instruction per call argument setup here. + addCost(Call.arg_size() * InlineConstants::InstrCost); - case Intrinsic::memset: - case Intrinsic::memcpy: - case Intrinsic::memmove: + if (!Call.onlyReadsMemory()) disableLoadElimination(); - // SROA can usually chew through these intrinsics, but they aren't free. - return false; - case Intrinsic::icall_branch_funnel: - case Intrinsic::localescape: - HasUninlineableIntrinsic = true; - return false; - case Intrinsic::vastart: - InitsVargArgs = true; - return false; - } + return Base::visitCallBase(Call); } + } - if (F == Call.getFunction()) { - // This flag will fully abort the analysis, so don't bother with anything - // else. - IsRecursiveCall = true; - return false; - } + assert(F && "Expected a call to a known function"); - if (TTI.isLoweredToCall(F)) { - // We account for the average 1 instruction per call argument setup - // here. - addCost(Call.arg_size() * InlineConstants::InstrCost); + // When we have a concrete function, first try to simplify it directly. 
+ if (simplifyCallSite(F, Call)) + return true; - // Everything other than inline ASM will also have a significant cost - // merely from making the call. - if (!isa(Call.getCalledValue())) - addCost(InlineConstants::CallPenalty); - } + // Next check if it is an intrinsic we know about. + // FIXME: Lift this into part of the InstVisitor. + if (IntrinsicInst *II = dyn_cast(&Call)) { + switch (II->getIntrinsicID()) { + default: + if (!Call.onlyReadsMemory() && !isAssumeLikeIntrinsic(II)) + disableLoadElimination(); + return Base::visitCallBase(Call); + + case Intrinsic::load_relative: + // This is normally lowered to 4 LLVM instructions. + addCost(3 * InlineConstants::InstrCost); + return false; - if (!Call.onlyReadsMemory()) + case Intrinsic::memset: + case Intrinsic::memcpy: + case Intrinsic::memmove: disableLoadElimination(); - return Base::visitCallBase(Call); + // SROA can usually chew through these intrinsics, but they aren't free. + return false; + case Intrinsic::icall_branch_funnel: + case Intrinsic::localescape: + HasUninlineableIntrinsic = true; + return false; + case Intrinsic::vastart: + InitsVargArgs = true; + return false; + } } - // Otherwise we're in a very special case -- an indirect function call. See - // if we can be particularly clever about this. - Value *Callee = Call.getCalledValue(); - - // First, pay the price of the argument setup. We account for the average - // 1 instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); - - // Next, check if this happens to be an indirect function call to a known - // function in this inline context. If not, we've done all we can. - Function *F = dyn_cast_or_null(SimplifiedValues.lookup(Callee)); - if (!F) { - if (!Call.onlyReadsMemory()) - disableLoadElimination(); - return Base::visitCallBase(Call); + if (F == Call.getFunction()) { + // This flag will fully abort the analysis, so don't bother with anything + // else. + IsRecursiveCall = true; + return false; } - // If we have a constant that we are calling as a function, we can peer - // through it and see the function target. This happens not infrequently - // during devirtualization and so we want to give it a hefty bonus for - // inlining, but cap that bonus in the event that inlining wouldn't pan - // out. Pretend to inline the function, with a custom threshold. - auto IndirectCallParams = Params; - IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold; - CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call, - IndirectCallParams); - if (CA.analyzeCall(Call)) { - // We were able to inline the indirect call! Subtract the cost from the - // threshold to get the bonus we want to apply, but don't go below zero. - Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + if (TTI.isLoweredToCall(F)) { + // We account for the average 1 instruction per call argument setup here. + addCost(Call.arg_size() * InlineConstants::InstrCost); + + // If we have a constant that we are calling as a function, we can peer + // through it and see the function target. This happens not infrequently + // during devirtualization and so we want to give it a hefty bonus for + // inlining, but cap that bonus in the event that inlining wouldn't pan out. + // Pretend to inline the function, with a custom threshold. 
+ if (IsIndirectCall && BoostIndirectCalls) { + auto IndirectCallParams = Params; + IndirectCallParams.DefaultThreshold = + InlineConstants::IndirectCallThreshold; + CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call, + IndirectCallParams, false); + if (CA.analyzeCall(Call)) { + // We were able to inline the indirect call! Subtract the cost from the + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + } + } else + // Otherwise simply add the cost for merely making the call. + addCost(InlineConstants::CallPenalty); } - if (!F->onlyReadsMemory()) + if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) disableLoadElimination(); return Base::visitCallBase(Call); } @@ -1494,7 +1494,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; int64_t SwitchCost = - ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; + ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; addCost(SwitchCost, (int64_t)CostUpperBound); return false; diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d997acb365c47..7942cb09e84c9 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -5086,6 +5086,11 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1, return Op0; } break; + case Intrinsic::copysign: + // copysign X, X --> X + if (Op0 == Op1) + return Op0; + break; case Intrinsic::maxnum: case Intrinsic::minnum: case Intrinsic::maximum: diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index bad2de9e5f5e0..78ad5859de4c6 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -136,12 +136,9 @@ namespace { /// A callback value handle updates the cache when values are erased. class LazyValueInfoCache; struct LVIValueHandle final : public CallbackVH { - // Needs to access getValPtr(), which is protected. - friend struct DenseMapInfo; - LazyValueInfoCache *Parent; - LVIValueHandle(Value *V, LazyValueInfoCache *P) + LVIValueHandle(Value *V, LazyValueInfoCache *P = nullptr) : CallbackVH(V), Parent(P) { } void deleted() override; @@ -155,89 +152,63 @@ namespace { /// This is the cache kept by LazyValueInfo which /// maintains information about queries across the clients' queries. class LazyValueInfoCache { - /// This is all of the cached block information for exactly one Value*. - /// The entries are sorted by the BasicBlock* of the - /// entries, allowing us to do a lookup with a binary search. - /// Over-defined lattice values are recorded in OverDefinedCache to reduce - /// memory overhead. - struct ValueCacheEntryTy { - ValueCacheEntryTy(Value *V, LazyValueInfoCache *P) : Handle(V, P) {} - LVIValueHandle Handle; - SmallDenseMap, ValueLatticeElement, 4> BlockVals; + /// This is all of the cached information for one basic block. It contains + /// the per-value lattice elements, as well as a separate set for + /// overdefined values to reduce memory usage. + struct BlockCacheEntryTy { + SmallDenseMap, ValueLatticeElement, 4> LatticeElements; + SmallDenseSet, 4> OverDefined; }; - /// This tracks, on a per-block basis, the set of values that are - /// over-defined at the end of that block. 
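The arithmetic of the indirect-call boost above is easy to lose in the surrounding restructuring; here is a toy model with arbitrary example numbers (the function name and values are ours):

```cpp
#include <algorithm>
#include <cassert>

// If the nested CallAnalyzer succeeds on the callee discovered behind an
// indirect call, the unused part of its threshold becomes a bonus on the
// caller's cost, clamped at zero so a barely passing candidate cannot
// make the call cheaper.
static int boostedCost(int Cost, bool NestedAnalysisSucceeded,
                       int NestedThreshold, int NestedCost) {
  if (NestedAnalysisSucceeded)
    Cost -= std::max(0, NestedThreshold - NestedCost);
  return Cost;
}

int main() {
  assert(boostedCost(100, true, 660, 200) == -360); // large bonus applied
  assert(boostedCost(100, true, 660, 700) == 100);  // clamped at zero
  assert(boostedCost(100, false, 660, 200) == 100); // no boost requested
  return 0;
}
```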
- typedef DenseMap, SmallPtrSet> - OverDefinedCacheTy; - /// Keep track of all blocks that we have ever seen, so we - /// don't spend time removing unused blocks from our caches. - DenseSet > SeenBlocks; - - /// This is all of the cached information for all values, - /// mapped from Value* to key information. - DenseMap> ValueCache; - OverDefinedCacheTy OverDefinedCache; - + /// Cached information per basic block. + DenseMap, BlockCacheEntryTy> BlockCache; + /// Set of value handles used to erase values from the cache on deletion. + DenseSet> ValueHandles; public: void insertResult(Value *Val, BasicBlock *BB, const ValueLatticeElement &Result) { - SeenBlocks.insert(BB); - + auto &CacheEntry = BlockCache.try_emplace(BB).first->second; // Insert over-defined values into their own cache to reduce memory // overhead. if (Result.isOverdefined()) - OverDefinedCache[BB].insert(Val); - else { - auto It = ValueCache.find_as(Val); - if (It == ValueCache.end()) { - ValueCache[Val] = std::make_unique(Val, this); - It = ValueCache.find_as(Val); - assert(It != ValueCache.end() && "Val was just added to the map!"); - } - It->second->BlockVals[BB] = Result; - } - } - - bool isOverdefined(Value *V, BasicBlock *BB) const { - auto ODI = OverDefinedCache.find(BB); - - if (ODI == OverDefinedCache.end()) - return false; + CacheEntry.OverDefined.insert(Val); + else + CacheEntry.LatticeElements.insert({ Val, Result }); - return ODI->second.count(V); + auto HandleIt = ValueHandles.find_as(Val); + if (HandleIt == ValueHandles.end()) + ValueHandles.insert({ Val, this }); } bool hasCachedValueInfo(Value *V, BasicBlock *BB) const { - if (isOverdefined(V, BB)) - return true; - - auto I = ValueCache.find_as(V); - if (I == ValueCache.end()) + auto It = BlockCache.find(BB); + if (It == BlockCache.end()) return false; - return I->second->BlockVals.count(BB); + return It->second.OverDefined.count(V) || + It->second.LatticeElements.count(V); } ValueLatticeElement getCachedValueInfo(Value *V, BasicBlock *BB) const { - if (isOverdefined(V, BB)) + auto It = BlockCache.find(BB); + if (It == BlockCache.end()) + return ValueLatticeElement(); + + if (It->second.OverDefined.count(V)) return ValueLatticeElement::getOverdefined(); - auto I = ValueCache.find_as(V); - if (I == ValueCache.end()) + auto LatticeIt = It->second.LatticeElements.find(V); + if (LatticeIt == It->second.LatticeElements.end()) return ValueLatticeElement(); - auto BBI = I->second->BlockVals.find(BB); - if (BBI == I->second->BlockVals.end()) - return ValueLatticeElement(); - return BBI->second; + + return LatticeIt->second; } /// clear - Empty the cache. void clear() { - SeenBlocks.clear(); - ValueCache.clear(); - OverDefinedCache.clear(); + BlockCache.clear(); + ValueHandles.clear(); } /// Inform the cache that a given value has been deleted. @@ -251,23 +222,18 @@ namespace { /// OldSucc might have (unless also overdefined in NewSucc). This just /// flushes elements from the cache and does not add any. void threadEdgeImpl(BasicBlock *OldSucc,BasicBlock *NewSucc); - - friend struct LVIValueHandle; }; } void LazyValueInfoCache::eraseValue(Value *V) { - for (auto I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E;) { - // Copy and increment the iterator immediately so we can erase behind - // ourselves. 
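A standalone model of the reorganized LVI cache, using illustrative names and std containers in place of LLVM's dense ones: information is now grouped per block, with overdefined values kept in a side set to keep entries small.

```cpp
#include <map>
#include <set>
#include <string>

struct LatticeElement {}; // stand-in for ValueLatticeElement

struct BlockCacheEntry {
  std::map<std::string, LatticeElement> LatticeElements;
  std::set<std::string> OverDefined;
};

using BlockCache = std::map<int, BlockCacheEntry>;

// Erasing a dead block is now a single map erase...
static void eraseBlock(BlockCache &Cache, int BB) { Cache.erase(BB); }

// ...and erasing a value walks each block entry once, instead of walking a
// value-indexed cache and a separate block-indexed overdefined cache.
static void eraseValue(BlockCache &Cache, const std::string &V) {
  for (auto &Pair : Cache) {
    Pair.second.LatticeElements.erase(V);
    Pair.second.OverDefined.erase(V);
  }
}
```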
- auto Iter = I++; - SmallPtrSetImpl &ValueSet = Iter->second; - ValueSet.erase(V); - if (ValueSet.empty()) - OverDefinedCache.erase(Iter); + for (auto &Pair : BlockCache) { + Pair.second.LatticeElements.erase(V); + Pair.second.OverDefined.erase(V); } - ValueCache.erase(V); + auto HandleIt = ValueHandles.find_as(V); + if (HandleIt != ValueHandles.end()) + ValueHandles.erase(HandleIt); } void LVIValueHandle::deleted() { @@ -277,18 +243,7 @@ void LVIValueHandle::deleted() { } void LazyValueInfoCache::eraseBlock(BasicBlock *BB) { - // Shortcut if we have never seen this block. - DenseSet >::iterator I = SeenBlocks.find(BB); - if (I == SeenBlocks.end()) - return; - SeenBlocks.erase(I); - - auto ODI = OverDefinedCache.find(BB); - if (ODI != OverDefinedCache.end()) - OverDefinedCache.erase(ODI); - - for (auto &I : ValueCache) - I.second->BlockVals.erase(BB); + BlockCache.erase(BB); } void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc, @@ -306,10 +261,11 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc, std::vector worklist; worklist.push_back(OldSucc); - auto I = OverDefinedCache.find(OldSucc); - if (I == OverDefinedCache.end()) + auto I = BlockCache.find(OldSucc); + if (I == BlockCache.end() || I->second.OverDefined.empty()) return; // Nothing to process here. - SmallVector ValsToClear(I->second.begin(), I->second.end()); + SmallVector ValsToClear(I->second.OverDefined.begin(), + I->second.OverDefined.end()); // Use a worklist to perform a depth-first search of OldSucc's successors. // NOTE: We do not need a visited list since any blocks we have already @@ -323,10 +279,10 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc, if (ToUpdate == NewSucc) continue; // If a value was marked overdefined in OldSucc, and is here too... - auto OI = OverDefinedCache.find(ToUpdate); - if (OI == OverDefinedCache.end()) + auto OI = BlockCache.find(ToUpdate); + if (OI == BlockCache.end() || OI->second.OverDefined.empty()) continue; - SmallPtrSetImpl &ValueSet = OI->second; + auto &ValueSet = OI->second.OverDefined; bool changed = false; for (Value *V : ValsToClear) { @@ -336,11 +292,6 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc, // If we removed anything, then we potentially need to update // blocks successors too. changed = true; - - if (ValueSet.empty()) { - OverDefinedCache.erase(OI); - break; - } } if (!changed) continue; diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 66c43cb451118..d635afb0a299c 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -5717,10 +5717,11 @@ ScalarEvolution::getRangeRef(const SCEV *S, if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. KnownBits Known = computeKnownBits(U->getValue(), DL, 0, &AC, nullptr, &DT); - if (Known.One != ~Known.Zero + 1) - ConservativeResult = - ConservativeResult.intersectWith( - ConstantRange(Known.One, ~Known.Zero + 1), RangeType); + // If Known does not result in full-set, intersect with it. 
+ if (Known.getMinValue() != Known.getMaxValue() + 1) + ConservativeResult = ConservativeResult.intersectWith( + ConstantRange(Known.getMinValue(), Known.getMaxValue() + 1), + RangeType); } else { assert(SignHint == ScalarEvolution::HINT_RANGE_SIGNED && "generalize as needed!"); @@ -12040,6 +12041,12 @@ ScalarEvolution ScalarEvolutionAnalysis::run(Function &F, AM.getResult(F)); } +PreservedAnalyses +ScalarEvolutionVerifierPass::run(Function &F, FunctionAnalysisManager &AM) { + AM.getResult(F).verify(); + return PreservedAnalyses::all(); +} + PreservedAnalyses ScalarEvolutionPrinterPass::run(Function &F, FunctionAnalysisManager &AM) { AM.getResult(F).print(OS); diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp index 067283d38b66f..a331b95e818b2 100644 --- a/llvm/lib/Analysis/VFABIDemangling.cpp +++ b/llvm/lib/Analysis/VFABIDemangling.cpp @@ -402,8 +402,8 @@ Optional VFABI::tryDemangleForVFABI(StringRef MangledName) { assert(Parameters.back().ParamKind == VFParamKind::GlobalPredicate && "The global predicate must be the last parameter"); - const VFShape Shape({VF, IsScalable, ISA, Parameters}); - return VFInfo({Shape, ScalarName, VectorName}); + const VFShape Shape({VF, IsScalable, Parameters}); + return VFInfo({Shape, ScalarName, VectorName, ISA}); } VFParamKind VFABI::getVFParamKindFromString(const StringRef Token) { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 51d92cca214ba..f46bae77ba269 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -915,7 +915,7 @@ static void computeKnownBitsFromShiftOperator( // If the shift amount could be greater than or equal to the bit-width of the // LHS, the value could be poison, but bail out because the check below is // expensive. TODO: Should we just carry on? - if ((~Known.Zero).uge(BitWidth)) { + if (Known.getMaxValue().uge(BitWidth)) { Known.resetAll(); return; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 44043bd582c6e..c45ab941a1428 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1182,3 +1182,47 @@ void VFABI::getVectorVariantNames( VariantMappings.push_back(S); } } + +bool VFShape::hasValidParameterList() const { + for (unsigned Pos = 0, NumParams = Parameters.size(); Pos < NumParams; + ++Pos) { + assert(Parameters[Pos].ParamPos == Pos && "Broken parameter list."); + + switch (Parameters[Pos].ParamKind) { + default: // Nothing to check. + break; + case VFParamKind::OMP_Linear: + case VFParamKind::OMP_LinearRef: + case VFParamKind::OMP_LinearVal: + case VFParamKind::OMP_LinearUVal: + // Compile time linear steps must be non-zero. + if (Parameters[Pos].LinearStepOrPos == 0) + return false; + break; + case VFParamKind::OMP_LinearPos: + case VFParamKind::OMP_LinearRefPos: + case VFParamKind::OMP_LinearValPos: + case VFParamKind::OMP_LinearUValPos: + // The runtime linear step must be referring to some other + // parameters in the signature. + if (Parameters[Pos].LinearStepOrPos >= int(NumParams)) + return false; + // The linear step parameter must be marked as uniform. + if (Parameters[Parameters[Pos].LinearStepOrPos].ParamKind != + VFParamKind::OMP_Uniform) + return false; + // The linear step parameter can't point at itself. + if (Parameters[Pos].LinearStepOrPos == int(Pos)) + return false; + break; + case VFParamKind::GlobalPredicate: + // The global predicate must be the unique. Can be placed anywhere in the + // signature. 
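The ScalarEvolution change a few hunks up can be restated as a small helper; the function name is ours, and the KnownBits/ConstantRange calls are the ones used in the hunk:

```cpp
#include "llvm/IR/ConstantRange.h"
#include "llvm/Support/KnownBits.h"

using namespace llvm;

// KnownBits bounds give the closed interval [MinValue, MaxValue], while
// ConstantRange is half-open, hence MaxValue + 1. When Min == Max + 1
// (after wrap) the known bits constrain nothing, so the caller skips the
// intersection rather than building a full-set range.
static ConstantRange rangeFromKnownBits(const KnownBits &Known) {
  APInt Lo = Known.getMinValue();
  APInt Hi = Known.getMaxValue() + 1;
  if (Lo == Hi)
    return ConstantRange::getFull(Known.getBitWidth());
  return ConstantRange(Lo, Hi);
}
```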
+ for (unsigned NextPos = Pos + 1; NextPos < NumParams; ++NextPos) + if (Parameters[NextPos].ParamKind == VFParamKind::GlobalPredicate) + return false; + break; + } + } + return true; +} diff --git a/llvm/lib/BinaryFormat/XCOFF.cpp b/llvm/lib/BinaryFormat/XCOFF.cpp index 001b8077cd3d1..29ccbaea3584d 100644 --- a/llvm/lib/BinaryFormat/XCOFF.cpp +++ b/llvm/lib/BinaryFormat/XCOFF.cpp @@ -24,6 +24,10 @@ StringRef XCOFF::getMappingClassString(XCOFF::StorageMappingClass SMC) { return "BS"; case XCOFF::XMC_RO: return "RO"; + case XCOFF::XMC_UA: + return "UA"; + case XCOFF::XMC_TC: + return "TC"; default: report_fatal_error("Unhandled storage-mapping class."); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp index c4d5b717b25df..84b86a71fa5fe 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp @@ -319,8 +319,10 @@ DIEUnit::DIEUnit(uint16_t V, uint8_t A, dwarf::Tag UnitTag) { Die.Owner = this; assert((UnitTag == dwarf::DW_TAG_compile_unit || + UnitTag == dwarf::DW_TAG_skeleton_unit || UnitTag == dwarf::DW_TAG_type_unit || - UnitTag == dwarf::DW_TAG_partial_unit) && "expected a unit TAG"); + UnitTag == dwarf::DW_TAG_partial_unit) && + "expected a unit TAG"); } void DIEValue::EmitValue(const AsmPrinter *AP) const { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 9578e01abdd47..4e90c10e3e9d8 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -52,10 +52,23 @@ using namespace llvm; +static dwarf::Tag GetCompileUnitType(UnitKind Kind, DwarfDebug *DW) { + + // According to DWARF Debugging Information Format Version 5, + // 3.1.2 Skeleton Compilation Unit Entries: + // "When generating a split DWARF object file (see Section 7.3.2 + // on page 187), the compilation unit in the .debug_info section + // is a "skeleton" compilation unit with the tag DW_TAG_skeleton_unit" + if (DW->getDwarfVersion() >= 5 && Kind == UnitKind::Skeleton) + return dwarf::DW_TAG_skeleton_unit; + + return dwarf::DW_TAG_compile_unit; +} + DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, - DwarfFile *DWU) - : DwarfUnit(dwarf::DW_TAG_compile_unit, Node, A, DW, DWU), UniqueID(UID) { + DwarfFile *DWU, UnitKind Kind) + : DwarfUnit(GetCompileUnitType(Kind, DW), Node, A, DW, DWU), UniqueID(UID) { insertDIE(Node, &getUnitDie()); MacroLabelBegin = Asm->createTempSymbol("cu_macro_begin"); } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 1b7ea2673ac09..8491d078ed899 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -40,6 +40,8 @@ class MCExpr; class MCSymbol; class MDNode; +enum class UnitKind { Skeleton, Full }; + class DwarfCompileUnit final : public DwarfUnit { /// A numeric ID unique among all CUs in the module unsigned UniqueID; @@ -104,7 +106,8 @@ class DwarfCompileUnit final : public DwarfUnit { public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, - DwarfDebug *DW, DwarfFile *DWU); + DwarfDebug *DW, DwarfFile *DWU, + UnitKind Kind = UnitKind::Full); bool hasRangeLists() const { return HasRangeLists; } unsigned getUniqueID() const { return UniqueID; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 551e8a2751b5f..09772537a97b8 100644 --- 
a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -617,6 +617,10 @@ static void collectCallSiteParameters(const MachineInstr *CallMI, // Search for a loading value in forwarding registers. for (; I != MBB->rend(); ++I) { + // Skip bundle headers. + if (I->isBundle()) + continue; + // If the next instruction is a call we can not interpret parameter's // forwarding registers or we finished the interpretation of all parameters. if (I->isCall()) @@ -1169,7 +1173,7 @@ void DwarfDebug::finalizeModuleInfo() { auto *CUNode = cast(P.first); // If compile Unit has macros, emit "DW_AT_macro_info" attribute. - if (CUNode->getMacros()) + if (CUNode->getMacros() && !useSplitDwarf()) U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macro_info, U.getMacroLabelBegin(), TLOF.getDwarfMacinfoSection()->getBeginSymbol()); @@ -1208,10 +1212,10 @@ void DwarfDebug::endModule() { emitDebugStr(); if (useSplitDwarf()) - // Handles debug_loc.dwo / debug_loclists.dwo section emission + // Emit debug_loc.dwo/debug_loclists.dwo section. emitDebugLocDWO(); else - // Handles debug_loc / debug_loclists section emission + // Emit debug_loc/debug_loclists section. emitDebugLoc(); // Corresponding abbreviations into a abbrev section. @@ -1227,8 +1231,12 @@ void DwarfDebug::endModule() { // Emit info into a debug ranges section. emitDebugRanges(); + if (useSplitDwarf()) + // Emit info into a debug macinfo.dwo section. + emitDebugMacinfoDWO(); + else // Emit info into a debug macinfo section. - emitDebugMacinfo(); + emitDebugMacinfo(); if (useSplitDwarf()) { emitDebugStrDWO(); @@ -2783,6 +2791,24 @@ void DwarfDebug::emitDebugMacinfo() { } } +void DwarfDebug::emitDebugMacinfoDWO() { + for (const auto &P : CUMap) { + auto &TheCU = *P.second; + auto *SkCU = TheCU.getSkeleton(); + DwarfCompileUnit &U = SkCU ? *SkCU : TheCU; + auto *CUNode = cast(P.first); + DIMacroNodeArray Macros = CUNode->getMacros(); + if (Macros.empty()) + continue; + Asm->OutStreamer->SwitchSection( + Asm->getObjFileLowering().getDwarfMacinfoDWOSection()); + Asm->OutStreamer->EmitLabel(U.getMacroLabelBegin()); + handleMacroNodes(Macros, U); + Asm->OutStreamer->AddComment("End Of Macro List Mark"); + Asm->emitInt8(0); + } +} + // DWARF5 Experimental Separate Dwarf emitters. void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, @@ -2799,7 +2825,8 @@ void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die, DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) { auto OwnedUnit = std::make_unique( - CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder); + CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder, + UnitKind::Skeleton); DwarfCompileUnit &NewCU = *OwnedUnit; NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection()); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index 85016074e2519..03949dbddea69 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -508,6 +508,8 @@ class DwarfDebug : public DebugHandlerBase { /// Emit macros into a debug macinfo section. void emitDebugMacinfo(); + /// Emit macros into a debug macinfo.dwo section. 
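A condensed restatement of GetCompileUnitType from the DwarfCompileUnit hunk above, with IsSkeleton and DwarfVersion standing in for the UnitKind argument and the DwarfDebug query:

```cpp
#include "llvm/BinaryFormat/Dwarf.h"

using namespace llvm;

// Only DWARF v5 split-DWARF skeleton CUs get the new DW_TAG_skeleton_unit
// tag (DWARF 5, section 3.1.2); everything else, including pre-v5 split
// DWARF, keeps DW_TAG_compile_unit.
static dwarf::Tag unitTagFor(bool IsSkeleton, unsigned DwarfVersion) {
  return (DwarfVersion >= 5 && IsSkeleton) ? dwarf::DW_TAG_skeleton_unit
                                           : dwarf::DW_TAG_compile_unit;
}
```

The emitDebugMacinfoDWO path added here follows the same split: with Split DWARF the macro list is emitted into the macinfo.dwo section under the skeleton's label; otherwise the plain macinfo path is used.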
+ void emitDebugMacinfoDWO(); void emitMacro(DIMacro &M); void emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U); void handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 9d7fee1d5b389..86522a85427a5 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -800,6 +800,15 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) { if (!Name.empty()) addString(Buffer, dwarf::DW_AT_name, Name); + // If alignment is specified for a typedef , create and insert DW_AT_alignment + // attribute in DW_TAG_typedef DIE. + if (Tag == dwarf::DW_TAG_typedef && DD->getDwarfVersion() >= 5) { + uint32_t AlignInBytes = DTy->getAlignInBytes(); + if (AlignInBytes > 0) + addUInt(Buffer, dwarf::DW_AT_alignment, dwarf::DW_FORM_udata, + AlignInBytes); + } + // Add size if non-zero (derived types might be zero-sized.) if (Size && Tag != dwarf::DW_TAG_pointer_type && Tag != dwarf::DW_TAG_ptr_to_member_type diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index 8875568c5938f..c10c3f4d78634 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -163,6 +163,7 @@ add_llvm_component_library(LLVMCodeGen TargetRegisterInfo.cpp TargetSchedule.cpp TargetSubtargetInfo.cpp + TypePromotion.cpp TwoAddressInstructionPass.cpp UnreachableBlockElim.cpp ValueTypes.cpp diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 85696ccc482a7..20fc67cc66ae7 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -105,6 +105,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeTailDuplicatePass(Registry); initializeTargetPassConfigPass(Registry); initializeTwoAddressInstructionPassPass(Registry); + initializeTypePromotionPass(Registry); initializeUnpackMachineBundlesPass(Registry); initializeUnreachableBlockElimLegacyPassPass(Registry); initializeUnreachableMachineBlockElimPass(Registry); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 89f69bdf37e97..0689f8e4f0c30 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1054,7 +1054,7 @@ bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) { // Collect all the relocate calls associated with a statepoint AllRelocateCalls.push_back(Relocate); - // We need atleast one base pointer relocation + one derived pointer + // We need at least one base pointer relocation + one derived pointer // relocation to mangle if (AllRelocateCalls.size() < 2) return false; diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp index 702e7e244bcec..8d9d48402b311 100644 --- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp +++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp @@ -261,15 +261,25 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) { for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { MachineOperand &MO = MI.getOperand(i); - if (MO.isRegMask()) - for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) - if (MO.clobbersPhysReg(i)) { + if (MO.isRegMask()) { + auto ClobbersPhysRegAndSubRegs = [&](unsigned PhysReg) { + for (MCSubRegIterator SRI(PhysReg, TRI, true); SRI.isValid(); ++SRI) + if (!MO.clobbersPhysReg(*SRI)) + return false; + + return true; + }; + + for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) { + if 
(ClobbersPhysRegAndSubRegs(i)) { DefIndices[i] = Count; KillIndices[i] = ~0u; KeepRegs.reset(i); Classes[i] = nullptr; RegRefs.erase(i); } + } + } if (!MO.isReg()) continue; Register Reg = MO.getReg(); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index bcf31e16142cf..6712ff5c732d8 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -74,12 +74,35 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { return false; Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); + + // Give up if either DstReg or SrcReg is a physical register. + if (Register::isPhysicalRegister(DstReg) || + Register::isPhysicalRegister(SrcReg)) + return false; + + // Give up the types don't match. LLT DstTy = MRI.getType(DstReg); LLT SrcTy = MRI.getType(SrcReg); - // Simple Copy Propagation. - // a(sx) = COPY b(sx) -> Replace all uses of a with b. - if (DstTy.isValid() && SrcTy.isValid() && DstTy == SrcTy) + // Give up if one has a valid LLT, but the other doesn't. + if (DstTy.isValid() != SrcTy.isValid()) + return false; + // Give up if the types don't match. + if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy) + return false; + + // Get the register banks and classes. + const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg); + const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg); + const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg); + const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg); + + // Replace if the register constraints match. + if ((SrcRC == DstRC) && (SrcBank == DstBank)) return true; + // Replace if DstReg has no constraints. + if (!DstBank && !DstRC) + return true; + return false; } void CombinerHelper::applyCombineCopy(MachineInstr &MI) { diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d8bcc59c7658e..5e1d5d9b579b6 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -1675,7 +1675,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_CONSTANT: { MachineOperand &SrcMO = MI.getOperand(1); LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext(); - const APInt &Val = SrcMO.getCImm()->getValue().sext(WideTy.getSizeInBits()); + unsigned ExtOpc = LI.getExtOpcodeForWideningConstant( + MRI.getType(MI.getOperand(0).getReg())); + assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT || + ExtOpc == TargetOpcode::G_ANYEXT) && + "Illegal Extend"); + const APInt &SrcVal = SrcMO.getCImm()->getValue(); + const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT) + ? 
SrcVal.sext(WideTy.getSizeInBits()) + : SrcVal.zext(WideTy.getSizeInBits()); Observer.changingInstr(MI); SrcMO.setCImm(ConstantInt::get(Ctx, Val)); @@ -2109,7 +2117,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { default: llvm_unreachable("Unexpected opcode"); case TargetOpcode::G_LOAD: - MIRBuilder.buildAnyExt(DstReg, TmpReg); + MIRBuilder.buildExtOrTrunc(TargetOpcode::G_ANYEXT, DstReg, TmpReg); break; case TargetOpcode::G_SEXTLOAD: MIRBuilder.buildSExt(DstReg, TmpReg); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index 70045512fae51..f897f9c7e20aa 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -685,6 +685,10 @@ bool LegalizerInfo::legalizeIntrinsic(MachineInstr &MI, return true; } +unsigned LegalizerInfo::getExtOpcodeForWideningConstant(LLT SmallTy) const { + return SmallTy.isByteSized() ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT; +} + /// \pre Type indices of every opcode form a dense set starting from 0. void LegalizerInfo::verify(const MCInstrInfo &MII) const { #ifndef NDEBUG diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp index 4e54437947ff6..b3ca4c1d8020b 100644 --- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp +++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp @@ -698,7 +698,7 @@ void ImplicitNullChecks::rewriteNullChecks( if (auto *DepMI = NC.getOnlyDependency()) { for (auto &MO : DepMI->operands()) { - if (!MO.isReg() || !MO.getReg() || !MO.isDef()) + if (!MO.isReg() || !MO.getReg() || !MO.isDef() || MO.isDead()) continue; if (!NC.getNotNullSucc()->isLiveIn(MO.getReg())) NC.getNotNullSucc()->addLiveIn(MO.getReg()); diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp index 2408f18678e46..75d978472cf35 100644 --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -543,8 +543,7 @@ bool InlineSpiller::canGuaranteeAssignmentAfterRemat(unsigned VReg, bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) { // Analyze instruction SmallVector, 8> Ops; - MIBundleOperands::VirtRegInfo RI = - MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops); + VirtRegInfo RI = AnalyzeVirtRegInBundle(MI, VirtReg.reg, &Ops); if (!RI.Reads) return false; @@ -782,7 +781,7 @@ static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B, /// foldMemoryOperand - Try folding stack slot references in Ops into their /// instructions. /// -/// @param Ops Operand indices from analyzeVirtReg(). +/// @param Ops Operand indices from AnalyzeVirtRegInBundle(). /// @param LoadMI Load instruction to use instead of stack slot when non-null. /// @return True on success. bool InlineSpiller:: @@ -851,8 +850,7 @@ foldMemoryOperand(ArrayRef> Ops, // Skip non-Defs, including undef uses and internal reads. if (MO->isUse()) continue; - MIBundleOperands::PhysRegInfo RI = - MIBundleOperands(*FoldMI).analyzePhysReg(Reg, &TRI); + PhysRegInfo RI = AnalyzePhysRegInBundle(*FoldMI, Reg, &TRI); if (RI.FullyDefined) continue; // FoldMI does not define this physreg. Remove the LI segment. @@ -992,8 +990,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) { // Analyze instruction. SmallVector, 8> Ops; - MIBundleOperands::VirtRegInfo RI = - MIBundleOperands(*MI).analyzeVirtReg(Reg, &Ops); + VirtRegInfo RI = AnalyzeVirtRegInBundle(*MI, Reg, &Ops); // Find the slot index where this instruction reads and writes OldLI. 
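The new LegalizerInfo default from the hunk above, restated on its own; the header paths are assumed for this revision of the tree, and the body is verbatim from the patch:

```cpp
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/Support/LowLevelTypeImpl.h"

using namespace llvm;

// Default extension policy when widening a G_CONSTANT: byte-sized scalars
// (s8, s16, ...) widen by sign-extension, odd sizes such as s1 widen by
// zero-extension. Targets may override getExtOpcodeForWideningConstant to
// pick a different opcode, which widenScalar then honors.
static unsigned extOpcodeForWideningConstant(LLT SmallTy) {
  return SmallTy.isByteSized() ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
}
```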
// This is usually the def slot, except for tied early clobbers. diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp index 7734f5e5ef707..04efa7bc35e96 100644 --- a/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -144,60 +144,6 @@ class LiveDebugValues : public MachineFunctionPass { using FragmentInfo = DIExpression::FragmentInfo; using OptFragmentInfo = Optional; - /// Storage for identifying a potentially inlined instance of a variable, - /// or a fragment thereof. - class DebugVariable { - const DILocalVariable *Variable; - OptFragmentInfo Fragment; - const DILocation *InlinedAt; - - /// Fragment that will overlap all other fragments. Used as default when - /// caller demands a fragment. - static const FragmentInfo DefaultFragment; - - public: - DebugVariable(const DILocalVariable *Var, OptFragmentInfo &&FragmentInfo, - const DILocation *InlinedAt) - : Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {} - - DebugVariable(const DILocalVariable *Var, OptFragmentInfo &FragmentInfo, - const DILocation *InlinedAt) - : Variable(Var), Fragment(FragmentInfo), InlinedAt(InlinedAt) {} - - DebugVariable(const DILocalVariable *Var, const DIExpression *DIExpr, - const DILocation *InlinedAt) - : DebugVariable(Var, DIExpr->getFragmentInfo(), InlinedAt) {} - - DebugVariable(const MachineInstr &MI) - : DebugVariable(MI.getDebugVariable(), - MI.getDebugExpression()->getFragmentInfo(), - MI.getDebugLoc()->getInlinedAt()) {} - - const DILocalVariable *getVar() const { return Variable; } - const OptFragmentInfo &getFragment() const { return Fragment; } - const DILocation *getInlinedAt() const { return InlinedAt; } - - const FragmentInfo getFragmentDefault() const { - return Fragment.getValueOr(DefaultFragment); - } - - static bool isFragmentDefault(FragmentInfo &F) { - return F == DefaultFragment; - } - - bool operator==(const DebugVariable &Other) const { - return std::tie(Variable, Fragment, InlinedAt) == - std::tie(Other.Variable, Other.Fragment, Other.InlinedAt); - } - - bool operator<(const DebugVariable &Other) const { - return std::tie(Variable, Fragment, InlinedAt) < - std::tie(Other.Variable, Other.Fragment, Other.InlinedAt); - } - }; - - friend struct llvm::DenseMapInfo; - /// A pair of debug variable and value location. struct VarLoc { // The location at which a spilled variable resides. It consists of a @@ -226,7 +172,9 @@ class LiveDebugValues : public MachineFunctionPass { RegisterKind, SpillLocKind, ImmediateKind, - EntryValueKind + EntryValueKind, + EntryValueBackupKind, + EntryValueCopyBackupKind } Kind = InvalidKind; /// The value location. Stored separately to avoid repeatedly @@ -241,14 +189,15 @@ class LiveDebugValues : public MachineFunctionPass { } Loc; VarLoc(const MachineInstr &MI, LexicalScopes &LS) - : Var(MI), Expr(MI.getDebugExpression()), MI(MI), - UVS(MI.getDebugLoc(), LS) { + : Var(MI.getDebugVariable(), MI.getDebugExpression(), + MI.getDebugLoc()->getInlinedAt()), + Expr(MI.getDebugExpression()), MI(MI), UVS(MI.getDebugLoc(), LS) { static_assert((sizeof(Loc) == sizeof(uint64_t)), "hash does not cover all members of Loc"); assert(MI.isDebugValue() && "not a DBG_VALUE"); assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE"); if (int RegNo = isDbgValueDescribedByReg(MI)) { - Kind = MI.isDebugEntryValue() ? 
EntryValueKind : RegisterKind; + Kind = RegisterKind; Loc.RegNo = RegNo; } else if (MI.getOperand(0).isImm()) { Kind = ImmediateKind; @@ -260,17 +209,50 @@ class LiveDebugValues : public MachineFunctionPass { Kind = ImmediateKind; Loc.CImm = MI.getOperand(0).getCImm(); } - assert((Kind != ImmediateKind || !MI.isDebugEntryValue()) && - "entry values must be register locations"); + + // We create the debug entry values from the factory functions rather than + // from this ctor. + assert(Kind != EntryValueKind && !isEntryBackupLoc()); } /// Take the variable and machine-location in DBG_VALUE MI, and build an /// entry location using the given expression. static VarLoc CreateEntryLoc(const MachineInstr &MI, LexicalScopes &LS, - const DIExpression *EntryExpr) { + const DIExpression *EntryExpr, unsigned Reg) { VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); VL.Kind = EntryValueKind; VL.Expr = EntryExpr; + VL.Loc.RegNo = Reg; + return VL; + } + + /// Take the variable and machine-location from the DBG_VALUE (from the + /// function entry), and build an entry value backup location. The backup + /// location will turn into the normal location if the backup is valid at + /// the time of the primary location clobbering. + static VarLoc CreateEntryBackupLoc(const MachineInstr &MI, + LexicalScopes &LS, + const DIExpression *EntryExpr) { + VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); + VL.Kind = EntryValueBackupKind; + VL.Expr = EntryExpr; + return VL; + } + + /// Take the variable and machine-location from the DBG_VALUE (from the + /// function entry), and build a copy of an entry value backup location by + /// setting the register location to NewReg. + static VarLoc CreateEntryCopyBackupLoc(const MachineInstr &MI, + LexicalScopes &LS, + const DIExpression *EntryExpr, + unsigned NewReg) { + VarLoc VL(MI, LS); + assert(VL.Kind == RegisterKind); + VL.Kind = EntryValueCopyBackupKind; + VL.Expr = EntryExpr; + VL.Loc.RegNo = NewReg; return VL; } @@ -309,8 +291,11 @@ class LiveDebugValues : public MachineFunctionPass { switch (Kind) { case EntryValueKind: // An entry value is a register location -- but with an updated - // expression. - return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, Expr); + // expression. The register location of such DBG_VALUE is always the one + // from the entry DBG_VALUE, it does not matter if the entry value was + // copied in to another register due to some optimizations. + return BuildMI(MF, DbgLoc, IID, Indirect, MI.getOperand(0).getReg(), + Var, Expr); case RegisterKind: // Register locations are like the source DBG_VALUE, but with the // register number from this VarLoc. @@ -329,8 +314,11 @@ class LiveDebugValues : public MachineFunctionPass { MachineOperand MO = MI.getOperand(0); return BuildMI(MF, DbgLoc, IID, Indirect, MO, Var, DIExpr); } + case EntryValueBackupKind: + case EntryValueCopyBackupKind: case InvalidKind: - llvm_unreachable("Tried to produce DBG_VALUE for invalid VarLoc"); + llvm_unreachable( + "Tried to produce DBG_VALUE for invalid or backup VarLoc"); } llvm_unreachable("Unrecognized LiveDebugValues.VarLoc.Kind enum"); } @@ -338,6 +326,27 @@ class LiveDebugValues : public MachineFunctionPass { /// Is the Loc field a constant or constant object? bool isConstant() const { return Kind == ImmediateKind; } + /// Check if the Loc field is an entry backup location. 
+ bool isEntryBackupLoc() const { + return Kind == EntryValueBackupKind || Kind == EntryValueCopyBackupKind; + } + + /// If this variable is described by a register holding the entry value, + /// return it, otherwise return 0. + unsigned getEntryValueBackupReg() const { + if (Kind == EntryValueBackupKind) + return Loc.RegNo; + return 0; + } + + /// If this variable is described by a register holding the copy of the + /// entry value, return it, otherwise return 0. + unsigned getEntryValueCopyBackupReg() const { + if (Kind == EntryValueCopyBackupKind) + return Loc.RegNo; + return 0; + } + /// If this variable is described by a register, return it, /// otherwise return 0. unsigned isDescribedByReg() const { @@ -357,6 +366,8 @@ class LiveDebugValues : public MachineFunctionPass { switch (Kind) { case RegisterKind: case EntryValueKind: + case EntryValueBackupKind: + case EntryValueCopyBackupKind: dbgs() << printReg(Loc.RegNo, TRI); break; case SpillLocKind: @@ -370,11 +381,17 @@ class LiveDebugValues : public MachineFunctionPass { llvm_unreachable("Invalid VarLoc in dump method"); } - dbgs() << ", \"" << Var.getVar()->getName() << "\", " << *Expr << ", "; + dbgs() << ", \"" << Var.getVariable()->getName() << "\", " << *Expr + << ", "; if (Var.getInlinedAt()) dbgs() << "!" << Var.getInlinedAt()->getMetadataID() << ")\n"; else - dbgs() << "(null))\n"; + dbgs() << "(null))"; + + if (isEntryBackupLoc()) + dbgs() << " (backup loc)\n"; + else + dbgs() << "\n"; } #endif @@ -390,7 +407,6 @@ class LiveDebugValues : public MachineFunctionPass { } }; - using DebugParamMap = SmallDenseMap; using VarLocMap = UniqueVector; using VarLocSet = SparseBitVector<>; using VarLocInMBB = SmallDenseMap; @@ -416,10 +432,18 @@ class LiveDebugValues : public MachineFunctionPass { /// This holds the working set of currently open ranges. For fast /// access, this is done both as a set of VarLocIDs, and a map of /// DebugVariable to recent VarLocID. Note that a DBG_VALUE ends all - /// previous open ranges for the same variable. + /// previous open ranges for the same variable. In addition, we keep + /// two different maps (Vars/EntryValuesBackupVars), so erase/insert + /// methods act differently depending on whether a VarLoc is primary + /// location or backup one. In the case the VarLoc is backup location + /// we will erase/insert from the EntryValuesBackupVars map, otherwise + /// we perform the operation on the Vars. class OpenRangesSet { VarLocSet VarLocs; + // Map the DebugVariable to recent primary location ID. SmallDenseMap Vars; + // Map the DebugVariable to recent backup location ID. + SmallDenseMap EntryValuesBackupVars; OverlapMap &OverlappingFragments; public: @@ -427,40 +451,38 @@ class LiveDebugValues : public MachineFunctionPass { const VarLocSet &getVarLocs() const { return VarLocs; } - /// Terminate all open ranges for Var by removing it from the set. - void erase(DebugVariable Var); + /// Terminate all open ranges for VL.Var by removing it from the set. + void erase(const VarLoc &VL); /// Terminate all open ranges listed in \c KillSet by removing /// them from the set. - void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs) { - VarLocs.intersectWithComplement(KillSet); - for (unsigned ID : KillSet) - Vars.erase(VarLocIDs[ID].Var); - } + void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs); /// Insert a new range into the set. 
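The dual bookkeeping that OpenRangesSet now performs can be modeled compactly; this sketch uses illustrative names and std containers, and only mirrors the backup-vs-primary dispatch described in the comments above:

```cpp
#include <map>
#include <set>
#include <string>

// Primary locations and entry-value backup locations live in separate
// maps; erase and insert pick the map based on whether the location is a
// backup, while the flat VarLocs ID set covers both.
struct OpenRanges {
  std::map<std::string, unsigned> Vars;                  // primary
  std::map<std::string, unsigned> EntryValuesBackupVars; // backup
  std::set<unsigned> VarLocs;

  void insert(unsigned ID, const std::string &Var, bool IsBackup) {
    VarLocs.insert(ID);
    (IsBackup ? EntryValuesBackupVars : Vars).emplace(Var, ID);
  }

  void erase(const std::string &Var, bool IsBackup) {
    auto &From = IsBackup ? EntryValuesBackupVars : Vars;
    auto It = From.find(Var);
    if (It == From.end())
      return;
    VarLocs.erase(It->second);
    From.erase(It);
  }
};
```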
- void insert(unsigned VarLocID, DebugVariable Var) { - VarLocs.set(VarLocID); - Vars.insert({Var, VarLocID}); - } + void insert(unsigned VarLocID, const VarLoc &VL); /// Insert a set of ranges. void insertFromLocSet(const VarLocSet &ToLoad, const VarLocMap &Map) { for (unsigned Id : ToLoad) { - const VarLoc &Var = Map[Id]; - insert(Id, Var.Var); + const VarLoc &VarL = Map[Id]; + insert(Id, VarL); } } + llvm::Optional getEntryValueBackup(DebugVariable Var); + /// Empty the set. void clear() { VarLocs.clear(); Vars.clear(); + EntryValuesBackupVars.clear(); } /// Return whether the set is empty or not. bool empty() const { - assert(Vars.empty() == VarLocs.empty() && "open ranges are inconsistent"); + assert(Vars.empty() == EntryValuesBackupVars.empty() && + Vars.empty() == VarLocs.empty() && + "open ranges are inconsistent"); return VarLocs.empty(); } }; @@ -502,21 +524,23 @@ class LiveDebugValues : public MachineFunctionPass { VarLocMap &VarLocIDs); void transferSpillOrRestoreInst(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers); + bool removeEntryValue(const MachineInstr &MI, OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, const VarLoc &EntryVL); void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers, - DebugParamMap &DebugEntryVals, SparseBitVector<> &KillSet); + void recordEntryValue(const MachineInstr &MI, + const DefinedRegsSet &DefinedRegs, + OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs); void transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers); void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocMap &VarLocIDs, TransferMap &Transfers, - DebugParamMap &DebugEntryVals); + VarLocMap &VarLocIDs, TransferMap &Transfers); bool transferTerminator(MachineBasicBlock *MBB, OpenRangesSet &OpenRanges, VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs); void process(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocMap &VarLocIDs, TransferMap &Transfers, - DebugParamMap &DebugEntryVals); + VarLocMap &VarLocIDs, TransferMap &Transfers); void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments, OverlapMap &OLapMap); @@ -559,46 +583,10 @@ class LiveDebugValues : public MachineFunctionPass { } // end anonymous namespace -namespace llvm { - -template <> struct DenseMapInfo { - using DV = LiveDebugValues::DebugVariable; - using OptFragmentInfo = LiveDebugValues::OptFragmentInfo; - using FragmentInfo = LiveDebugValues::FragmentInfo; - - // Empty key: no key should be generated that has no DILocalVariable. 
- static inline DV getEmptyKey() { - return DV(nullptr, OptFragmentInfo(), nullptr); - } - - // Difference in tombstone is that the Optional is meaningful - static inline DV getTombstoneKey() { - return DV(nullptr, OptFragmentInfo({0, 0}), nullptr); - } - - static unsigned getHashValue(const DV &D) { - unsigned HV = 0; - const OptFragmentInfo &Fragment = D.getFragment(); - if (Fragment) - HV = DenseMapInfo::getHashValue(*Fragment); - - return hash_combine(D.getVar(), HV, D.getInlinedAt()); - } - - static bool isEqual(const DV &A, const DV &B) { return A == B; } -}; - -} // namespace llvm - //===----------------------------------------------------------------------===// // Implementation //===----------------------------------------------------------------------===// -const DIExpression::FragmentInfo - LiveDebugValues::DebugVariable::DefaultFragment = { - std::numeric_limits::max(), - std::numeric_limits::min()}; - char LiveDebugValues::ID = 0; char &llvm::LiveDebugValuesID = LiveDebugValues::ID; @@ -619,38 +607,72 @@ void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const { } /// Erase a variable from the set of open ranges, and additionally erase any -/// fragments that may overlap it. +/// fragments that may overlap it. If the VarLoc is a backup location, erase +/// the variable from the EntryValuesBackupVars set, indicating we should stop +/// tracking its backup entry location. Otherwise, if the VarLoc is a primary +/// location, erase the variable from the Vars set. void LiveDebugValues::OpenRangesSet::erase(const VarLoc &VL) { // Erasure helper. - auto DoErase = [this](DebugVariable VarToErase) { - auto It = Vars.find(VarToErase); - if (It != Vars.end()) { + auto DoErase = [VL, this](DebugVariable VarToErase) { + auto *EraseFrom = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars; + auto It = EraseFrom->find(VarToErase); + if (It != EraseFrom->end()) { unsigned ID = It->second; VarLocs.reset(ID); - Vars.erase(It); + EraseFrom->erase(It); } }; + DebugVariable Var = VL.Var; + // Erase the variable/fragment that ends here. DoErase(Var); // Extract the fragment. Interpret an empty fragment as one that covers all // possible bits. - FragmentInfo ThisFragment = Var.getFragmentDefault(); + FragmentInfo ThisFragment = Var.getFragmentOrDefault(); // There may be fragments that overlap the designated fragment. Look them up // in the pre-computed overlap map, and erase them too. - auto MapIt = OverlappingFragments.find({Var.getVar(), ThisFragment}); + auto MapIt = OverlappingFragments.find({Var.getVariable(), ThisFragment}); if (MapIt != OverlappingFragments.end()) { for (auto Fragment : MapIt->second) { LiveDebugValues::OptFragmentInfo FragmentHolder; - if (!DebugVariable::isFragmentDefault(Fragment)) + if (!DebugVariable::isDefaultFragment(Fragment)) FragmentHolder = LiveDebugValues::OptFragmentInfo(Fragment); - DoErase({Var.getVar(), FragmentHolder, Var.getInlinedAt()}); + DoErase({Var.getVariable(), FragmentHolder, Var.getInlinedAt()}); } } } +void LiveDebugValues::OpenRangesSet::erase(const VarLocSet &KillSet, + const VarLocMap &VarLocIDs) { + VarLocs.intersectWithComplement(KillSet); + for (unsigned ID : KillSet) { + const VarLoc *VL = &VarLocIDs[ID]; + auto *EraseFrom = VL->isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars; + EraseFrom->erase(VL->Var); + } +} + +void LiveDebugValues::OpenRangesSet::insert(unsigned VarLocID, + const VarLoc &VL) { + auto *InsertInto = VL.isEntryBackupLoc() ?
&EntryValuesBackupVars : &Vars; + VarLocs.set(VarLocID); + InsertInto->insert({VL.Var, VarLocID}); +} + +/// Return the Loc ID of an entry value backup location, if it exists for the +/// variable. +llvm::Optional +LiveDebugValues::OpenRangesSet::getEntryValueBackup(DebugVariable Var) { + auto It = EntryValuesBackupVars.find(Var); + if (It != EntryValuesBackupVars.end()) + return It->second; + + return llvm::None; +} + //===----------------------------------------------------------------------===// // Debug Range Extension Implementation //===----------------------------------------------------------------------===// @@ -669,7 +691,7 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF, Out << "MBB: " << BB.getNumber() << ":\n"; for (unsigned VLL : L) { const VarLoc &VL = VarLocIDs[VLL]; - Out << " Var: " << VL.Var.getVar()->getName(); + Out << " Var: " << VL.Var.getVariable()->getName(); Out << " MI: "; VL.dump(TRI, Out); } @@ -693,6 +715,62 @@ LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI) { return {Reg, Offset}; } +/// Try to salvage the debug entry value if we encounter a new debug value +/// describing the same parameter; otherwise stop tracking the value. Return +/// true if we should stop tracking the entry value and false otherwise. +bool LiveDebugValues::removeEntryValue(const MachineInstr &MI, + OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs, + const VarLoc &EntryVL) { + // Skip the DBG_VALUE which is the debug entry value itself. + if (MI.isIdenticalTo(EntryVL.MI)) + return false; + + // If the parameter's location is not a register location, we cannot track + // the entry value any more. In addition, if the debug expression from the + // DBG_VALUE is not empty, we can assume the parameter's value has changed, + // indicating that we should stop tracking its entry value as well. + if (!MI.getOperand(0).isReg() || + MI.getDebugExpression()->getNumElements() != 0) + return true; + + // If the DBG_VALUE comes from a copy instruction that copies the entry value, + // it means the parameter's value has not changed and we should be able to use + // its entry value. + bool TrySalvageEntryValue = false; + Register Reg = MI.getOperand(0).getReg(); + auto I = std::next(MI.getReverseIterator()); + const MachineOperand *SrcRegOp, *DestRegOp; + if (I != MI.getParent()->rend()) { + // TODO: Try to keep tracking an entry value if we encounter a propagated + // DBG_VALUE describing a copy of the entry value. (A propagated entry + // value does not indicate a parameter modification.) + auto DestSrc = TII->isCopyInstr(*I); + if (!DestSrc) + return true; + + SrcRegOp = DestSrc->Source; + DestRegOp = DestSrc->Destination; + if (Reg != DestRegOp->getReg()) + return true; + TrySalvageEntryValue = true; + } + + if (TrySalvageEntryValue) { + for (unsigned ID : OpenRanges.getVarLocs()) { + const VarLoc &VL = VarLocIDs[ID]; + if (!VL.isEntryBackupLoc()) + continue; + + if (VL.getEntryValueCopyBackupReg() == Reg && + VL.MI.getOperand(0).getReg() == SrcRegOp->getReg()) + return false; + } + } + + return true; +} + /// End all previous ranges related to @MI and start a new range from @MI /// if it is a DBG_VALUE instr. void LiveDebugValues::transferDebugValue(const MachineInstr &MI, @@ -707,18 +785,33 @@ void LiveDebugValues::transferDebugValue(const MachineInstr &MI, assert(Var->isValidLocationForIntrinsic(DebugLoc) && "Expected inlined-at fields to agree"); - // End all previous ranges of Var.
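The decision made by removeEntryValue above reduces to a small predicate. The following condensed model uses hypothetical stand-in flags rather than the pass's real MachineInstr queries, but follows the same branch order:

#include <cassert>

// Stand-in summary of the facts removeEntryValue() inspects.
struct DbgValue {
  bool IsTheEntryValueItself; // MI.isIdenticalTo(EntryVL.MI)
  bool LocIsRegister;         // MI.getOperand(0).isReg()
  bool ExprIsEmpty;           // no extra DIExpression elements
  bool FedByCopyOfEntryValue; // preceding COPY whose source still holds it
};

// Returns true when the backup entry value must be dropped.
bool shouldStopTrackingEntryValue(const DbgValue &DV) {
  if (DV.IsTheEntryValueItself)
    return false;                   // seeing the entry value again is fine
  if (!DV.LocIsRegister || !DV.ExprIsEmpty)
    return true;                    // the value has demonstrably changed
  return !DV.FedByCopyOfEntryValue; // a plain move keeps the backup alive
}

int main() {
  assert(!shouldStopTrackingEntryValue({true, true, true, false}));
  assert(shouldStopTrackingEntryValue({false, false, true, false}));
  assert(!shouldStopTrackingEntryValue({false, true, true, true}));
}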
DebugVariable V(Var, Expr, InlinedAt); - OpenRanges.erase(V); - // Add the VarLoc to OpenRanges from this DBG_VALUE. + // Check if this DBG_VALUE indicates a parameter's value changing. + // If that is the case, we should stop tracking its entry value. + auto EntryValBackupID = OpenRanges.getEntryValueBackup(V); + if (Var->isParameter() && EntryValBackupID) { + const VarLoc &EntryVL = VarLocIDs[*EntryValBackupID]; + if (removeEntryValue(MI, OpenRanges, VarLocIDs, EntryVL)) { + LLVM_DEBUG(dbgs() << "Deleting a DBG entry value because of: "; + MI.print(dbgs(), /*IsStandalone*/ false, + /*SkipOpers*/ false, /*SkipDebugLoc*/ false, + /*AddNewLine*/ true, TII)); + OpenRanges.erase(EntryVL); + } + } + unsigned ID; if (isDbgValueDescribedByReg(MI) || MI.getOperand(0).isImm() || MI.getOperand(0).isFPImm() || MI.getOperand(0).isCImm()) { // Use normal VarLoc constructor for registers and immediates. VarLoc VL(MI, LS); + // End all previous ranges of VL.Var. + OpenRanges.erase(VL); + ID = VarLocIDs.insert(VL); - OpenRanges.insert(ID, VL.Var); + // Add the VarLoc to OpenRanges from this DBG_VALUE. + OpenRanges.insert(ID, VL); } else if (MI.hasOneMemOperand()) { llvm_unreachable("DBG_VALUE with mem operand encountered after regalloc?"); } else { @@ -728,32 +821,30 @@ void LiveDebugValues::transferDebugValue(const MachineInstr &MI, } } +/// Turn the entry value backup locations into primary locations. void LiveDebugValues::emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers, - DebugParamMap &DebugEntryVals, SparseBitVector<> &KillSet) { for (unsigned ID : KillSet) { - if (!VarLocIDs[ID].Var.getVar()->isParameter()) + if (!VarLocIDs[ID].Var.getVariable()->isParameter()) continue; - const MachineInstr *CurrDebugInstr = &VarLocIDs[ID].MI; + auto DebugVar = VarLocIDs[ID].Var; + auto EntryValBackupID = OpenRanges.getEntryValueBackup(DebugVar); - // If parameter's DBG_VALUE is not in the map that means we can't - // generate parameter's entry value. - if (!DebugEntryVals.count(CurrDebugInstr->getDebugVariable())) + // If the parameter has the entry value backup, it means we should + // be able to use its entry value. + if (!EntryValBackupID) continue; - auto ParamDebugInstr = DebugEntryVals[CurrDebugInstr->getDebugVariable()]; - DIExpression *NewExpr = DIExpression::prepend( - ParamDebugInstr->getDebugExpression(), DIExpression::EntryValue); - - VarLoc EntryLoc = VarLoc::CreateEntryLoc(*ParamDebugInstr, LS, NewExpr); - - unsigned EntryValLocID = VarLocIDs.insert(EntryLoc); - Transfers.push_back({&MI, EntryValLocID}); - OpenRanges.insert(EntryValLocID, EntryLoc.Var); + const VarLoc &EntryVL = VarLocIDs[*EntryValBackupID]; + VarLoc EntryLoc = + VarLoc::CreateEntryLoc(EntryVL.MI, LS, EntryVL.Expr, EntryVL.Loc.RegNo); + unsigned EntryValueID = VarLocIDs.insert(EntryLoc); + Transfers.push_back({&MI, EntryValueID}); + OpenRanges.insert(EntryValueID, EntryLoc); } } @@ -768,23 +859,21 @@ void LiveDebugValues::insertTransferDebugPair( unsigned NewReg) { const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI; - auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &DebugInstr, - &VarLocIDs](VarLoc &VL) { + auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &VarLocIDs](VarLoc &VL) { unsigned LocId = VarLocIDs.insert(VL); // Close this variable's previous location range. - DebugVariable V(*DebugInstr); - OpenRanges.erase(V); + OpenRanges.erase(VL); // Record the new location as an open range, and a postponed transfer // inserting a DBG_VALUE for this location. 
- OpenRanges.insert(LocId, VL.Var); + OpenRanges.insert(LocId, VL); TransferDebugPair MIP = {&MI, LocId}; Transfers.push_back(MIP); }; - // End all previous ranges of Var. - OpenRanges.erase(VarLocIDs[OldVarID].Var); + // End all previous ranges of VL.Var. + OpenRanges.erase(VarLocIDs[OldVarID]); switch (Kind) { case TransferKind::TransferCopy: { assert(NewReg && @@ -832,7 +921,7 @@ void LiveDebugValues::insertTransferDebugPair( /// A definition of a register may mark the end of a range. void LiveDebugValues::transferRegisterDef( MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, - TransferMap &Transfers, DebugParamMap &DebugEntryVals) { + TransferMap &Transfers) { MachineFunction *MF = MI.getMF(); const TargetLowering *TLI = MF->getSubtarget().getTargetLowering(); unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); @@ -866,8 +955,7 @@ void LiveDebugValues::transferRegisterDef( if (auto *TPC = getAnalysisIfAvailable()) { auto &TM = TPC->getTM(); if (TM.Options.EnableDebugEntryValues) - emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals, - KillSet); + emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, KillSet); } } @@ -1005,12 +1093,12 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI, if (TKind == TransferKind::TransferSpill && VarLocIDs[ID].isDescribedByReg() == Reg) { LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '(' - << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); + << VarLocIDs[ID].Var.getVariable()->getName() << ")\n"); } else if (TKind == TransferKind::TransferRestore && VarLocIDs[ID].Kind == VarLoc::SpillLocKind && VarLocIDs[ID].Loc.SpillLocation == *Loc) { LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '(' - << VarLocIDs[ID].Var.getVar()->getName() << ")\n"); + << VarLocIDs[ID].Var.getVariable()->getName() << ")\n"); } else continue; insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID, TKind, @@ -1026,14 +1114,14 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs, TransferMap &Transfers) { - auto DestSrc = TII->isCopyInstr(MI); if (!DestSrc) return; const MachineOperand *DestRegOp = DestSrc->Destination; const MachineOperand *SrcRegOp = DestSrc->Source; - if (!SrcRegOp->isKill() || !DestRegOp->isDef()) + + if (!DestRegOp->isDef()) return; auto isCalleeSavedReg = [&](unsigned Reg) { @@ -1054,6 +1142,30 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI, if (!isCalleeSavedReg(DestReg)) return; + // Remember an entry value movement. If we encounter a new debug value of + // a parameter describing only a move of the value, rather than a + // modification, we are still able to use the entry value if needed. + if (isRegOtherThanSPAndFP(*DestRegOp, MI, TRI)) { + for (unsigned ID : OpenRanges.getVarLocs()) { + if (VarLocIDs[ID].getEntryValueBackupReg() == SrcReg) { + LLVM_DEBUG(dbgs() << "Copy of the entry value: "; MI.dump();); + VarLoc EntryValLocCopyBackup = VarLoc::CreateEntryCopyBackupLoc( + VarLocIDs[ID].MI, LS, VarLocIDs[ID].Expr, DestReg); + + // Stop tracking the original entry value. + OpenRanges.erase(VarLocIDs[ID]); + + // Start tracking the entry value copy.
+ unsigned EntryValCopyLocID = VarLocIDs.insert(EntryValLocCopyBackup); + OpenRanges.insert(EntryValCopyLocID, EntryValLocCopyBackup); + break; + } + } + } + + if (!SrcRegOp->isKill()) + return; + for (unsigned ID : OpenRanges.getVarLocs()) { if (VarLocIDs[ID].isDescribedByReg() == SrcReg) { insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID, @@ -1099,26 +1211,27 @@ bool LiveDebugValues::transferTerminator(MachineBasicBlock *CurMBB, void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments, OverlapMap &OverlappingFragments) { - DebugVariable MIVar(MI); - FragmentInfo ThisFragment = MIVar.getFragmentDefault(); + DebugVariable MIVar(MI.getDebugVariable(), MI.getDebugExpression(), + MI.getDebugLoc()->getInlinedAt()); + FragmentInfo ThisFragment = MIVar.getFragmentOrDefault(); // If this is the first sighting of this variable, then we are guaranteed // there are currently no overlapping fragments either. Initialize the set // of seen fragments, record no overlaps for the current one, and return. - auto SeenIt = SeenFragments.find(MIVar.getVar()); + auto SeenIt = SeenFragments.find(MIVar.getVariable()); if (SeenIt == SeenFragments.end()) { SmallSet OneFragment; OneFragment.insert(ThisFragment); - SeenFragments.insert({MIVar.getVar(), OneFragment}); + SeenFragments.insert({MIVar.getVariable(), OneFragment}); - OverlappingFragments.insert({{MIVar.getVar(), ThisFragment}, {}}); + OverlappingFragments.insert({{MIVar.getVariable(), ThisFragment}, {}}); return; } // If this particular Variable/Fragment pair already exists in the overlap // map, it has already been accounted for. auto IsInOLapMap = - OverlappingFragments.insert({{MIVar.getVar(), ThisFragment}, {}}); + OverlappingFragments.insert({{MIVar.getVariable(), ThisFragment}, {}}); if (!IsInOLapMap.second) return; @@ -1136,7 +1249,7 @@ void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI, // Mark the previously seen fragment as being overlapped by the current // one. auto ASeenFragmentsOverlaps = - OverlappingFragments.find({MIVar.getVar(), ASeenFragment}); + OverlappingFragments.find({MIVar.getVariable(), ASeenFragment}); assert(ASeenFragmentsOverlaps != OverlappingFragments.end() && "Previously seen var fragment has no vector of overlaps"); ASeenFragmentsOverlaps->second.push_back(ThisFragment); @@ -1148,11 +1261,9 @@ void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI, /// This routine creates OpenRanges. 
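The hand-off performed by CreateEntryCopyBackupLoc above can be modeled as retargeting the tracked backup register whenever a copy moves the entry value. A hypothetical sketch of that state transition, with plain integers standing in for physical registers:

#include <cassert>
#include <optional>

// Register currently believed to hold the (possibly copied) entry value.
struct Backup {
  unsigned Reg;
  bool IsCopy; // EntryValueBackupKind vs. EntryValueCopyBackupKind
};

std::optional<Backup> transferCopy(std::optional<Backup> B, unsigned Src,
                                   unsigned Dst) {
  if (B && B->Reg == Src)
    return Backup{Dst, /*IsCopy=*/true}; // retarget backup at the copy's dest
  return B;                              // unrelated copy: nothing changes
}

int main() {
  std::optional<Backup> B = Backup{/*Reg=*/5, /*IsCopy=*/false}; // at entry
  B = transferCopy(B, /*Src=*/5, /*Dst=*/11); // $r11 = COPY $r5
  assert(B && B->Reg == 11 && B->IsCopy);
  B = transferCopy(B, /*Src=*/7, /*Dst=*/8);  // unrelated copy
  assert(B && B->Reg == 11);
}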
void LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges, - VarLocMap &VarLocIDs, TransferMap &Transfers, - DebugParamMap &DebugEntryVals) { + VarLocMap &VarLocIDs, TransferMap &Transfers) { transferDebugValue(MI, OpenRanges, VarLocIDs); - transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers, - DebugEntryVals); + transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers); transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers); transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers); } @@ -1201,7 +1312,7 @@ bool LiveDebugValues::join( if (!InLocsT.empty()) { for (auto ID : InLocsT) dbgs() << " gathered candidate incoming var: " - << VarLocIDs[ID].Var.getVar()->getName() << "\n"; + << VarLocIDs[ID].Var.getVariable()->getName() << "\n"; } }); @@ -1216,7 +1327,7 @@ bool LiveDebugValues::join( if (!VarLocIDs[ID].dominates(MBB)) { KillSet.set(ID); LLVM_DEBUG({ - auto Name = VarLocIDs[ID].Var.getVar()->getName(); + auto Name = VarLocIDs[ID].Var.getVariable()->getName(); dbgs() << " killing " << Name << ", it doesn't dominate MBB\n"; }); } @@ -1273,6 +1384,8 @@ void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs, // The ID location is live-in to MBB -- work out what kind of machine // location it is and create a DBG_VALUE. const VarLoc &DiffIt = VarLocIDs[ID]; + if (DiffIt.isEntryBackupLoc()) + continue; MachineInstr *MI = DiffIt.BuildDbgValue(*MBB.getParent()); MBB.insert(MBB.instr_begin(), MI); @@ -1284,8 +1397,7 @@ void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs, bool LiveDebugValues::isEntryValueCandidate( const MachineInstr &MI, const DefinedRegsSet &DefinedRegs) const { - if (!MI.isDebugValue()) - return false; + assert(MI.isDebugValue() && "This must be DBG_VALUE."); // TODO: Add support for local variables that are expressed in terms of // parameters entry values. @@ -1332,6 +1444,37 @@ static void collectRegDefs(const MachineInstr &MI, DefinedRegsSet &Regs, Regs.insert(*AI); } +/// This routine records the entry values of function parameters. The values +/// can be used as backup values. If we lose track of some unmodified +/// parameters, the backup values will be used as primary locations. +void LiveDebugValues::recordEntryValue(const MachineInstr &MI, + const DefinedRegsSet &DefinedRegs, + OpenRangesSet &OpenRanges, + VarLocMap &VarLocIDs) { + if (auto *TPC = getAnalysisIfAvailable()) { + auto &TM = TPC->getTM(); + if (!TM.Options.EnableDebugEntryValues) + return; + } + + DebugVariable V(MI.getDebugVariable(), MI.getDebugExpression(), + MI.getDebugLoc()->getInlinedAt()); + + if (!isEntryValueCandidate(MI, DefinedRegs) || + OpenRanges.getEntryValueBackup(V)) + return; + + LLVM_DEBUG(dbgs() << "Creating the backup entry location: "; MI.dump();); + + // Create the entry value and use it as a backup location while it is + // valid; it stays valid until the parameter is modified. + DIExpression *NewExpr = + DIExpression::prepend(MI.getDebugExpression(), DIExpression::EntryValue); + VarLoc EntryValLocAsBackup = VarLoc::CreateEntryBackupLoc(MI, LS, NewExpr); + unsigned EntryValLocID = VarLocIDs.insert(EntryValLocAsBackup); + OpenRanges.insert(EntryValLocID, EntryValLocAsBackup); +} + /// Calculate the liveness information for the given machine function and /// extend ranges across basic blocks.
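End to end, the backup mechanism has three phases: remember the parameter's entry location, keep the backup while the parameter is unmodified, and promote it when the primary location is clobbered. A hypothetical sketch of that lifecycle, with a string standing in for the DW_OP_entry_value expression the pass actually builds via DIExpression::prepend:

#include <cassert>
#include <optional>
#include <string>

struct ParamState {
  std::optional<std::string> Backup; // e.g. "DW_OP_entry_value($r0)"
  std::string Primary;               // current primary location
};

void recordEntryValue(ParamState &S, const std::string &EntryReg) {
  if (!S.Backup) // only the first DBG_VALUE at function entry qualifies
    S.Backup = "DW_OP_entry_value(" + EntryReg + ")";
}

void clobberPrimary(ParamState &S) {
  // emitEntryValues: promote the backup, if any, to the primary location.
  S.Primary = S.Backup ? *S.Backup : "<optimized out>";
}

int main() {
  ParamState S{std::nullopt, "$r0"};
  recordEntryValue(S, "$r0");
  clobberPrimary(S); // e.g. a call clobbers $r0
  assert(S.Primary == "DW_OP_entry_value($r0)");
}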
bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { @@ -1368,23 +1511,17 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { std::greater> Pending; - // Working set of currently collected debug variables mapped to DBG_VALUEs - // representing candidates for production of debug entry values. - DebugParamMap DebugEntryVals; - // Set of register defines that are seen when traversing the entry block // looking for debug entry value candidates. DefinedRegsSet DefinedRegs; // Only in the case of entry MBB collect DBG_VALUEs representing // function parameters in order to generate debug entry values for them. - MachineBasicBlock &First_MBB = *(MF.begin()); for (auto &MI : First_MBB) { collectRegDefs(MI, DefinedRegs, TRI); - if (isEntryValueCandidate(MI, DefinedRegs) && - !DebugEntryVals.count(MI.getDebugVariable())) - DebugEntryVals[MI.getDebugVariable()] = &MI; + if (MI.isDebugValue()) + recordEntryValue(MI, DefinedRegs, OpenRanges, VarLocIDs); } // Initialize per-block structures and scan for fragment overlaps. @@ -1443,7 +1580,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { // First load any pending inlocs. OpenRanges.insertFromLocSet(PendingInLocs[MBB], VarLocIDs); for (auto &MI : *MBB) - process(MI, OpenRanges, VarLocIDs, Transfers, DebugEntryVals); + process(MI, OpenRanges, VarLocIDs, Transfers); OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs); LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 600e7880c702b..9c80282bc59eb 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -1072,9 +1072,9 @@ class LiveIntervals::HMEditor { // Kill flags shouldn't be used while live intervals exist, they will be // reinserted by VirtRegRewriter. if (MachineInstr *KillMI = LIS.getInstructionFromIndex(OldIdxIn->end)) - for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO) - if (MO->isReg() && MO->isUse()) - MO->setIsKill(false); + for (MachineOperand &MOP : mi_bundle_ops(*KillMI)) + if (MOP.isReg() && MOP.isUse()) + MOP.setIsKill(false); // Is there a def before NewIdx which is not OldIdx? 
LiveRange::iterator Next = std::next(OldIdxIn); diff --git a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp index 7a57cd6890d10..5ef907b883155 100644 --- a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp +++ b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp @@ -49,10 +49,6 @@ static cl::opt cl::value_desc("N"), cl::desc("Function number to canonicalize.")); -static cl::opt CanonicalizeBasicBlockNumber( - "canon-nth-basicblock", cl::Hidden, cl::init(~0u), cl::value_desc("N"), - cl::desc("BasicBlock number to canonicalize.")); - namespace { class MIRCanonicalizer : public MachineFunctionPass { @@ -374,24 +370,7 @@ static bool doDefKillClear(MachineBasicBlock *MBB) { } static bool runOnBasicBlock(MachineBasicBlock *MBB, - std::vector &bbNames, - unsigned &basicBlockNum, VRegRenamer &Renamer) { - - if (CanonicalizeBasicBlockNumber != ~0U) { - if (CanonicalizeBasicBlockNumber != basicBlockNum++) - return false; - LLVM_DEBUG(dbgs() << "\n Canonicalizing BasicBlock " << MBB->getName() - << "\n";); - } - - if (llvm::find(bbNames, MBB->getName()) != bbNames.end()) { - LLVM_DEBUG({ - dbgs() << "Found potentially duplicate BasicBlocks: " << MBB->getName() - << "\n"; - }); - return false; - } - + unsigned BasicBlockNum, VRegRenamer &Renamer) { LLVM_DEBUG({ dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << " \n\n"; dbgs() << "\n\n================================================\n\n"; @@ -399,7 +378,6 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB, bool Changed = false; - bbNames.push_back(MBB->getName()); LLVM_DEBUG(dbgs() << "\n\n NEW BASIC BLOCK: " << MBB->getName() << "\n\n";); LLVM_DEBUG(dbgs() << "MBB Before Canonical Copy Propagation:\n"; @@ -412,8 +390,10 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB, Changed |= rescheduleCanonically(IdempotentInstCount, MBB); LLVM_DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump();); - Changed |= Renamer.renameVRegs(MBB); + Changed |= Renamer.renameVRegs(MBB, BasicBlockNum); + // TODO: Consider dropping this. Dropping kill defs is probably not + // semantically sound. 
Changed |= doDefKillClear(MBB); LLVM_DEBUG(dbgs() << "Updated MachineBasicBlock:\n"; MBB->dump(); @@ -445,16 +425,12 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) { : RPOList) { dbgs() << MBB->getName() << "\n"; } dbgs() << "\n\n================================================\n\n";); - std::vector BBNames; - unsigned BBNum = 0; - bool Changed = false; - MachineRegisterInfo &MRI = MF.getRegInfo(); VRegRenamer Renamer(MRI); for (auto MBB : RPOList) - Changed |= runOnBasicBlock(MBB, BBNames, BBNum, Renamer); + Changed |= runOnBasicBlock(MBB, BBNum++, Renamer); return Changed; } diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp index 62d0f2e52c7d2..9f61dd9ef243a 100644 --- a/llvm/lib/CodeGen/MIRNamerPass.cpp +++ b/llvm/lib/CodeGen/MIRNamerPass.cpp @@ -57,9 +57,10 @@ class MIRNamer : public MachineFunctionPass { VRegRenamer Renamer(MF.getRegInfo()); + unsigned BBIndex = 0; ReversePostOrderTraversal RPOT(&*MF.begin()); for (auto &MBB : RPOT) - Changed |= Renamer.renameVRegs(MBB); + Changed |= Renamer.renameVRegs(MBB, BBIndex++); return Changed; } diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.h b/llvm/lib/CodeGen/MIRVRegNamerUtils.h index ebe309757f27c..8e76bfa2bbd44 100644 --- a/llvm/lib/CodeGen/MIRVRegNamerUtils.h +++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.h @@ -84,7 +84,7 @@ class VRegRenamer { /// Same as the above, but sets a BBNum depending on BB traversal that /// will be used as prefix for the vreg names. - bool renameVRegs(MachineBasicBlock *MBB, unsigned BBNum = 0); + bool renameVRegs(MachineBasicBlock *MBB, unsigned BBNum); unsigned getCurrentBBNumber() const { return CurrentBBNumber; } }; diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp index 71354ea43453e..f433c4b6c90b5 100644 --- a/llvm/lib/CodeGen/MachineBasicBlock.cpp +++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp @@ -1395,8 +1395,7 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI, --N; - MachineOperandIteratorBase::PhysRegInfo Info = - ConstMIOperands(*I).analyzePhysReg(Reg, TRI); + PhysRegInfo Info = AnalyzePhysRegInBundle(*I, Reg, TRI); // Register is live when we read it here. if (Info.Read) @@ -1434,8 +1433,7 @@ MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI, --N; - MachineOperandIteratorBase::PhysRegInfo Info = - ConstMIOperands(*I).analyzePhysReg(Reg, TRI); + PhysRegInfo Info = AnalyzePhysRegInBundle(*I, Reg, TRI); // Defs happen after uses so they take precedence if both are present. diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 34ece614185c3..6db388c2564a2 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -37,6 +37,15 @@ // ... // No clobber of %R0 // %R1 = COPY %R0 <<< Removed // +// or +// +// $R0 = OP ... +// ... // No read/clobber of $R0 and $R1 +// $R1 = COPY $R0 // $R0 is killed +// Replace $R0 with $R1 and remove the COPY +// $R1 = OP ... +// ... +// //===----------------------------------------------------------------------===// #include "llvm/ADT/DenseMap.h" @@ -98,6 +107,28 @@ class CopyTracker { } } + /// Remove a register from the copy maps. + void invalidateRegister(unsigned Reg, const TargetRegisterInfo &TRI) { + // Since Reg might be a subreg of some registers, invalidating Reg alone is + // not enough. We have to find the COPY that defines Reg, or the registers + // defined by Reg, and invalidate all of them.
+ DenseSet RegsToInvalidate{Reg}; + for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { + auto I = Copies.find(*RUI); + if (I != Copies.end()) { + if (MachineInstr *MI = I->second.MI) { + RegsToInvalidate.insert(MI->getOperand(0).getReg()); + RegsToInvalidate.insert(MI->getOperand(1).getReg()); + } + RegsToInvalidate.insert(I->second.DefRegs.begin(), + I->second.DefRegs.end()); + } + } + for (unsigned InvalidReg : RegsToInvalidate) + for (MCRegUnitIterator RUI(InvalidReg, &TRI); RUI.isValid(); ++RUI) + Copies.erase(*RUI); + } + /// Clobber a single register, removing it from the tracker's copy maps. void clobberRegister(unsigned Reg, const TargetRegisterInfo &TRI) { for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) { @@ -151,6 +182,38 @@ class CopyTracker { return CI->second.MI; } + MachineInstr *findCopyDefViaUnit(unsigned RegUnit, + const TargetRegisterInfo &TRI) { + auto CI = Copies.find(RegUnit); + if (CI == Copies.end()) + return nullptr; + if (CI->second.DefRegs.size() != 1) + return nullptr; + MCRegUnitIterator RUI(CI->second.DefRegs[0], &TRI); + return findCopyForUnit(*RUI, TRI, true); + } + + MachineInstr *findAvailBackwardCopy(MachineInstr &I, unsigned Reg, + const TargetRegisterInfo &TRI) { + MCRegUnitIterator RUI(Reg, &TRI); + MachineInstr *AvailCopy = findCopyDefViaUnit(*RUI, TRI); + if (!AvailCopy || + !TRI.isSubRegisterEq(AvailCopy->getOperand(1).getReg(), Reg)) + return nullptr; + + Register AvailSrc = AvailCopy->getOperand(1).getReg(); + Register AvailDef = AvailCopy->getOperand(0).getReg(); + for (const MachineInstr &MI : + make_range(AvailCopy->getReverseIterator(), I.getReverseIterator())) + for (const MachineOperand &MO : MI.operands()) + if (MO.isRegMask()) + // FIXME: Shall we simultaneously invalidate AvailSrc or AvailDef? + if (MO.clobbersPhysReg(AvailSrc) || MO.clobbersPhysReg(AvailDef)) + return nullptr; + + return AvailCopy; + } + MachineInstr *findAvailCopy(MachineInstr &DestCopy, unsigned Reg, const TargetRegisterInfo &TRI) { // We check the first RegUnit here, since we'll only be interested in the @@ -211,11 +274,16 @@ class MachineCopyPropagation : public MachineFunctionPass { void ClobberRegister(unsigned Reg); void ReadRegister(unsigned Reg, MachineInstr &Reader, DebugType DT); - void CopyPropagateBlock(MachineBasicBlock &MBB); + void ForwardCopyPropagateBlock(MachineBasicBlock &MBB); + void BackwardCopyPropagateBlock(MachineBasicBlock &MBB); bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def); void forwardUses(MachineInstr &MI); + void propagateDefs(MachineInstr &MI); bool isForwardableRegClassCopy(const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx); + bool isBackwardPropagatableRegClassCopy(const MachineInstr &Copy, + const MachineInstr &UseI, + unsigned UseIdx); bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use); /// Candidates for deletion. @@ -313,6 +381,19 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src, return true; } +bool MachineCopyPropagation::isBackwardPropagatableRegClassCopy( + const MachineInstr &Copy, const MachineInstr &UseI, unsigned UseIdx) { + Register Def = Copy.getOperand(0).getReg(); + + if (const TargetRegisterClass *URC = + UseI.getRegClassConstraint(UseIdx, TII, TRI)) + return URC->contains(Def); + + // We don't process further if UseI is a COPY, since forward copy propagation + // should handle that. 
+ return false; +} + /// Decide whether we should forward the source of \param Copy to its use in /// \param UseI based on the physical register class constraints of the opcode /// and avoiding introducing more cross-class COPYs. @@ -468,8 +549,9 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) { } } -void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { - LLVM_DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n"); +void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { + LLVM_DEBUG(dbgs() << "MCP: ForwardCopyPropagateBlock " << MBB.getName() + << "\n"); for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) { MachineInstr *MI = &*I; @@ -647,6 +729,134 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) { Tracker.clear(); } +static bool isBackwardPropagatableCopy(MachineInstr &MI, + const MachineRegisterInfo &MRI) { + assert(MI.isCopy() && "MI is expected to be a COPY"); + Register Def = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + if (!Def || !Src) + return false; + + if (MRI.isReserved(Def) || MRI.isReserved(Src)) + return false; + + return MI.getOperand(1).isRenamable() && MI.getOperand(1).isKill(); +} + +void MachineCopyPropagation::propagateDefs(MachineInstr &MI) { + if (!Tracker.hasAnyCopies()) + return; + + for (unsigned OpIdx = 0, OpEnd = MI.getNumOperands(); OpIdx != OpEnd; + ++OpIdx) { + MachineOperand &MODef = MI.getOperand(OpIdx); + + if (!MODef.isReg() || MODef.isUse()) + continue; + + // Ignore non-trivial cases. + if (MODef.isTied() || MODef.isUndef() || MODef.isImplicit()) + continue; + + if (!MODef.getReg()) + continue; + + // We only handle the case where the register comes from a vreg. + if (!MODef.isRenamable()) + continue; + + MachineInstr *Copy = + Tracker.findAvailBackwardCopy(MI, MODef.getReg(), *TRI); + if (!Copy) + continue; + + Register Def = Copy->getOperand(0).getReg(); + Register Src = Copy->getOperand(1).getReg(); + + if (MODef.getReg() != Src) + continue; + + if (!isBackwardPropagatableRegClassCopy(*Copy, MI, OpIdx)) + continue; + + if (hasImplicitOverlap(MI, MODef)) + continue; + + LLVM_DEBUG(dbgs() << "MCP: Replacing " << printReg(MODef.getReg(), TRI) + << "\n with " << printReg(Def, TRI) << "\n in " + << MI << " from " << *Copy); + + MODef.setReg(Def); + MODef.setIsRenamable(Copy->getOperand(0).isRenamable()); + + LLVM_DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n"); + MaybeDeadCopies.insert(Copy); + Changed = true; + } +} + +void MachineCopyPropagation::BackwardCopyPropagateBlock( + MachineBasicBlock &MBB) { + LLVM_DEBUG(dbgs() << "MCP: BackwardCopyPropagateBlock " << MBB.getName() + << "\n"); + + for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend(); + I != E;) { + MachineInstr *MI = &*I; + ++I; + + // Ignore non-trivial COPYs. + if (MI->isCopy() && MI->getNumOperands() == 2 && + !TRI->regsOverlap(MI->getOperand(0).getReg(), + MI->getOperand(1).getReg())) { + + Register Def = MI->getOperand(0).getReg(); + Register Src = MI->getOperand(1).getReg(); + + // Unlike forward cp, we don't invoke propagateDefs here; + // we just let forward cp do COPY-to-COPY propagation. + if (isBackwardPropagatableCopy(*MI, *MRI)) { + Tracker.invalidateRegister(Src, *TRI); + Tracker.invalidateRegister(Def, *TRI); + Tracker.trackCopy(MI, *TRI); + continue; + } + } + + // Invalidate any earlyclobber regs first.
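The guard used by the backward walk above is small enough to state on its own. A toy version with stand-in fields rather than MachineOperand queries: the COPY's source must be renamable and killed at the COPY, and neither side may be reserved.

#include <cassert>

struct CopyInfo {
  bool SrcRenamable;
  bool SrcKilled;
  bool DefReserved;
  bool SrcReserved;
};

bool isBackwardPropagatable(const CopyInfo &C) {
  if (C.DefReserved || C.SrcReserved)
    return false;
  return C.SrcRenamable && C.SrcKilled;
}

int main() {
  // $r1 = COPY killed renamable $r0  -->  rewrite the def of $r0 to $r1.
  assert(isBackwardPropagatable({true, true, false, false}));
  // Without the kill flag the value is still needed in $r0; keep the COPY.
  assert(!isBackwardPropagatable({true, false, false, false}));
}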
+ for (const MachineOperand &MO : MI->operands()) + if (MO.isReg() && MO.isEarlyClobber()) { + Register Reg = MO.getReg(); + if (!Reg) + continue; + Tracker.invalidateRegister(Reg, *TRI); + } + + propagateDefs(*MI); + for (const MachineOperand &MO : MI->operands()) { + if (!MO.isReg()) + continue; + + if (!MO.getReg()) + continue; + + if (MO.isDef()) + Tracker.invalidateRegister(MO.getReg(), *TRI); + + if (MO.readsReg()) + Tracker.invalidateRegister(MO.getReg(), *TRI); + } + } + + for (auto *Copy : MaybeDeadCopies) + Copy->eraseFromParent(); + + MaybeDeadCopies.clear(); + CopyDbgUsers.clear(); + Tracker.clear(); +} + bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -657,8 +867,10 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget().getInstrInfo(); MRI = &MF.getRegInfo(); - for (MachineBasicBlock &MBB : MF) - CopyPropagateBlock(MBB); + for (MachineBasicBlock &MBB : MF) { + BackwardCopyPropagateBlock(MBB); + ForwardCopyPropagateBlock(MBB); + } return Changed; } diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index 18df5c69a22d9..94865b0e9031c 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -278,22 +278,18 @@ bool llvm::finalizeBundles(MachineFunction &MF) { return Changed; } -//===----------------------------------------------------------------------===// -// MachineOperand iterator -//===----------------------------------------------------------------------===// - -MachineOperandIteratorBase::VirtRegInfo -MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg, - SmallVectorImpl > *Ops) { - VirtRegInfo RI = { false, false, false }; - for(; isValid(); ++*this) { - MachineOperand &MO = deref(); +VirtRegInfo llvm::AnalyzeVirtRegInBundle( + MachineInstr &MI, unsigned Reg, + SmallVectorImpl> *Ops) { + VirtRegInfo RI = {false, false, false}; + for (MIBundleOperands O(MI); O.isValid(); ++O) { + MachineOperand &MO = *O; if (!MO.isReg() || MO.getReg() != Reg) continue; // Remember each (MI, OpNo) that refers to Reg. if (Ops) - Ops->push_back(std::make_pair(MO.getParent(), getOperandNo())); + Ops->push_back(std::make_pair(MO.getParent(), O.getOperandNo())); // Both defs and uses can read virtual registers. if (MO.readsReg()) { @@ -305,22 +301,22 @@ MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg, // Only defs can write. 
if (MO.isDef()) RI.Writes = true; - else if (!RI.Tied && MO.getParent()->isRegTiedToDefOperand(getOperandNo())) + else if (!RI.Tied && + MO.getParent()->isRegTiedToDefOperand(O.getOperandNo())) RI.Tied = true; } return RI; } -MachineOperandIteratorBase::PhysRegInfo -MachineOperandIteratorBase::analyzePhysReg(unsigned Reg, - const TargetRegisterInfo *TRI) { +PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, unsigned Reg, + const TargetRegisterInfo *TRI) { bool AllDefsDead = true; PhysRegInfo PRI = {false, false, false, false, false, false, false, false}; assert(Register::isPhysicalRegister(Reg) && "analyzePhysReg not given a physical register!"); - for (; isValid(); ++*this) { - MachineOperand &MO = deref(); + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + const MachineOperand &MO = *O; if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) { PRI.Clobbered = true; diff --git a/llvm/lib/CodeGen/MachineSizeOpts.cpp b/llvm/lib/CodeGen/MachineSizeOpts.cpp index 0c2ef3321e0a8..aff67f9cfd55f 100644 --- a/llvm/lib/CodeGen/MachineSizeOpts.cpp +++ b/llvm/lib/CodeGen/MachineSizeOpts.cpp @@ -107,14 +107,16 @@ struct MachineBasicBlockBFIAdapter { bool llvm::shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { + const MachineBlockFrequencyInfo *MBFI, + PGSOQueryType QueryType) { return shouldFuncOptimizeForSizeImpl( - MF, PSI, MBFI); + MF, PSI, MBFI, QueryType); } bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB, ProfileSummaryInfo *PSI, - const MachineBlockFrequencyInfo *MBFI) { + const MachineBlockFrequencyInfo *MBFI, + PGSOQueryType QueryType) { return shouldOptimizeForSizeImpl( - MBB, PSI, MBFI); + MBB, PSI, MBFI, QueryType); } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 6d88aae70af39..ca57e51268e88 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -124,8 +124,8 @@ namespace { void addRegWithSubRegs(RegVector &RV, unsigned Reg) { RV.push_back(Reg); if (Register::isPhysicalRegister(Reg)) - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) - RV.push_back(*SubRegs); + for (const MCPhysReg &SubReg : TRI->subregs(Reg)) + RV.push_back(SubReg); } struct BBInfo { @@ -802,18 +802,16 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB live-in list contains non-physical register", MBB); continue; } - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - regsLive.insert(*SubRegs); + for (const MCPhysReg &SubReg : TRI->subregs_inclusive(LI.PhysReg)) + regsLive.insert(SubReg); } } const MachineFrameInfo &MFI = MF->getFrameInfo(); BitVector PR = MFI.getPristineRegs(*MF); for (unsigned I : PR.set_bits()) { - for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - regsLive.insert(*SubRegs); + for (const MCPhysReg &SubReg : TRI->subregs_inclusive(I)) + regsLive.insert(SubReg); } regsKilled.clear(); @@ -1610,13 +1608,23 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } else if (MONum < MCID.getNumOperands()) { const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; // Don't check if it's the last operand in a variadic instruction. See, - // e.g., LDM_RET in the arm back end. 
- if (MO->isReg() && - !(MI->isVariadic() && MONum == MCID.getNumOperands()-1)) { - if (MO->isDef() && !MCOI.isOptionalDef()) - report("Explicit operand marked as def", MO, MONum); - if (MO->isImplicit()) - report("Explicit operand marked as implicit", MO, MONum); + // e.g., LDM_RET in the arm back end. Check non-variadic operands only. + bool IsOptional = MI->isVariadic() && MONum == MCID.getNumOperands() - 1; + if (!IsOptional) { + if (MO->isReg()) { + if (MO->isDef() && !MCOI.isOptionalDef()) + report("Explicit operand marked as def", MO, MONum); + if (MO->isImplicit()) + report("Explicit operand marked as implicit", MO, MONum); + } + + // Check that an instruction has register operands only as expected. + if (MCOI.OperandType == MCOI::OPERAND_REGISTER && + !MO->isReg() && !MO->isFI()) + report("Expected a register operand.", MO, MONum); + if ((MCOI.OperandType == MCOI::OPERAND_IMMEDIATE || + MCOI.OperandType == MCOI::OPERAND_PCREL) && MO->isReg()) + report("Expected a non-register operand.", MO, MONum); } int TiedTo = MCID.getOperandConstraint(MONum, MCOI::TIED_TO); @@ -2006,9 +2014,9 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { bool Bad = !isReserved(Reg); // We are fine if just any subregister has a defined value. if (Bad) { - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); - ++SubRegs) { - if (regsLive.count(*SubRegs)) { + + for (const MCPhysReg &SubReg : TRI->subregs(Reg)) { + if (regsLive.count(SubReg)) { Bad = false; break; } @@ -2026,9 +2034,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (!Register::isPhysicalRegister(MOP.getReg())) continue; - for (MCSubRegIterator SubRegs(MOP.getReg(), TRI); SubRegs.isValid(); - ++SubRegs) { - if (*SubRegs == Reg) { + for (const MCPhysReg &SubReg : TRI->subregs(MOP.getReg())) { + if (SubReg == Reg) { Bad = false; break; } diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index d21eae222af03..26cbc14166be4 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -36,6 +36,21 @@ static bool isHazard(const SDep &Dep) { return Dep.getKind() == SDep::Anti || Dep.getKind() == SDep::Output; } +static SUnit *getPredClusterSU(const SUnit &SU) { + for (const SDep &SI : SU.Preds) + if (SI.isCluster()) + return SI.getSUnit(); + + return nullptr; +} + +static bool hasLessThanNumFused(const SUnit &SU, unsigned FuseLimit) { + unsigned Num = 1; + const SUnit *CurrentSU = &SU; + while ((CurrentSU = getPredClusterSU(*CurrentSU)) && Num < FuseLimit) Num ++; + return Num < FuseLimit; +} + static bool fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, SUnit &SecondSU) { // Check that neither instr is already paired with another along the edge @@ -161,8 +176,10 @@ bool MacroFusion::scheduleAdjacentImpl(ScheduleDAGInstrs &DAG, SUnit &AnchorSU) if (DepSU.isBoundaryNode()) continue; + // Only chain two instructions together at most. 
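The fusion-limit walk introduced in MacroFusion above follows cluster predecessor edges and refuses to grow a chain past the limit. A self-contained sketch on a hypothetical node type (the real code walks SUnit::Preds for SDep::Cluster edges):

#include <cassert>

struct Node {
  Node *ClusterPred = nullptr; // the unique cluster-edge predecessor, if any
};

bool hasLessThanNumFused(const Node &N, unsigned FuseLimit) {
  unsigned Num = 1;
  const Node *Cur = &N;
  while ((Cur = Cur->ClusterPred) && Num < FuseLimit)
    ++Num;
  return Num < FuseLimit;
}

int main() {
  Node A, B, C;
  B.ClusterPred = &A; // A<-B already fused
  C.ClusterPred = &B; // A<-B<-C would be a chain of three
  assert(hasLessThanNumFused(A, 2));  // A can still accept a partner
  assert(!hasLessThanNumFused(B, 2)); // B is already in a pair
  assert(!hasLessThanNumFused(C, 2));
}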
const MachineInstr *DepMI = DepSU.getInstr(); - if (!shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI)) + if (!hasLessThanNumFused(DepSU, 2) || + !shouldScheduleAdjacent(TII, ST, DepMI, AnchorMI)) continue; if (fuseInstructionPair(DAG, DepSU, AnchorSU)) diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp index 2850033e64196..ad7f910be4c52 100644 --- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -189,7 +190,85 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) { return LatestDef; } +MachineInstr* ReachingDefAnalysis::getReachingMIDef(MachineInstr *MI, int PhysReg) { + return getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg)); +} + +bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B, + int PhysReg) { + MachineBasicBlock *ParentA = A->getParent(); + MachineBasicBlock *ParentB = B->getParent(); + if (ParentA != ParentB) + return false; + + return getReachingDef(A, PhysReg) == getReachingDef(B, PhysReg); +} + +MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB, + int InstId) { + assert(static_cast(MBB->getNumber()) < MBBReachingDefs.size() && + "Unexpected basic block number."); + assert(InstId < static_cast(MBB->size()) && + "Unexpected instruction id."); + + if (InstId < 0) + return nullptr; + + for (auto &MI : *MBB) { + if (InstIds.count(&MI) && InstIds[&MI] == InstId) + return &MI; + } + return nullptr; +} + int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) { assert(InstIds.count(MI) && "Unexpected machine instuction."); return InstIds[MI] - getReachingDef(MI, PhysReg); } + +void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg, + SmallVectorImpl &Uses) { + MachineBasicBlock *MBB = Def->getParent(); + MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def); + while (++MI != MBB->end()) { + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg) + continue; + + // If/when we find a new reaching def, we know that there's no more uses + // of 'Def'. + if (getReachingMIDef(&*MI, PhysReg) != Def) + return; + + Uses.push_back(&*MI); + if (MO.isKill()) + return; + } + } +} + +unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) { + SmallVector Uses; + getReachingLocalUses(Def, PhysReg, Uses); + return Uses.size(); +} + +bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) { + MachineBasicBlock *MBB = MI->getParent(); + LivePhysRegs LiveRegs(*TRI); + LiveRegs.addLiveOuts(*MBB); + + // Yes if the register is live out of the basic block. + if (LiveRegs.contains(PhysReg)) + return true; + + // Walk backwards through the block to see if the register is live at some + // point. 
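The new getReachingLocalUses helper above collects users of a def until a new reaching def ends the search. A simplified model, with instruction operands reduced to flags (the real code compares reaching defs rather than testing a DefsReg bit):

#include <cassert>
#include <vector>

struct Inst {
  bool UsesReg = false;
  bool DefsReg = false;
  bool KillsReg = false;
};

std::vector<const Inst *> reachingLocalUses(const std::vector<Inst> &Block,
                                            size_t DefIdx) {
  std::vector<const Inst *> Uses;
  for (size_t I = DefIdx + 1; I < Block.size(); ++I) {
    if (Block[I].DefsReg)
      break; // a new reaching def ends the search
    if (Block[I].UsesReg) {
      Uses.push_back(&Block[I]);
      if (Block[I].KillsReg)
        break; // nothing can use the register after a kill
    }
  }
  return Uses;
}

int main() {
  std::vector<Inst> B = {{false, true, false},  // def
                         {true, false, false},  // use
                         {true, false, true},   // use + kill
                         {true, false, false}}; // not reached by this def
  assert(reachingLocalUses(B, 0).size() == 2);
}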
+ for (auto Last = MBB->rbegin(), End = MBB->rend(); Last != End; ++Last) { + LiveRegs.stepBackward(*Last); + if (LiveRegs.contains(PhysReg)) + return InstIds[&*Last] > InstIds[MI]; + } + return false; +} + diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index 270209293f6a4..a5bea1463468a 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -221,8 +221,8 @@ void RegScavenger::forward() { // Ideally we would like a way to model this, but leaving the // insert_subreg around causes both correctness and performance issues. bool SubUsed = false; - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) - if (isRegUsed(*SubRegs)) { + for (const MCPhysReg &SubReg : TRI->subregs(Reg)) + if (isRegUsed(SubReg)) { SubUsed = true; break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 793352c16d35a..2c2f8fea97900 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -131,6 +131,7 @@ namespace { const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; + bool LegalDAG = false; bool LegalOperations = false; bool LegalTypes = false; bool ForCodeSize; @@ -179,6 +180,12 @@ namespace { AddToWorklist(Node); } + /// Convenient shorthand to add a node and all of its user to the worklist. + void AddToWorklistWithUsers(SDNode *N) { + AddUsersToWorklist(N); + AddToWorklist(N); + } + // Prune potentially dangling nodes. This is called after // any visit to a node, but should also be called during a visit after any // failed combine which may have created a DAG node. @@ -1395,6 +1402,7 @@ bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { void DAGCombiner::Run(CombineLevel AtLevel) { // set the instance variables, so that the various visit routines may use it. Level = AtLevel; + LegalDAG = Level >= AfterLegalizeDAG; LegalOperations = Level >= AfterLegalizeVectorOps; LegalTypes = Level >= AfterLegalizeTypes; @@ -1421,14 +1429,13 @@ void DAGCombiner::Run(CombineLevel AtLevel) { // If this combine is running after legalizing the DAG, re-legalize any // nodes pulled off the worklist. - if (Level == AfterLegalizeDAG) { + if (LegalDAG) { SmallSetVector UpdatedNodes; bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); - for (SDNode *LN : UpdatedNodes) { - AddUsersToWorklist(LN); - AddToWorklist(LN); - } + for (SDNode *LN : UpdatedNodes) + AddToWorklistWithUsers(LN); + if (!NIsValid) continue; } @@ -5332,7 +5339,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } - if (Level >= AfterLegalizeTypes) { + if (LegalTypes) { // Attempt to propagate the AND back up to the leaves which, if they're // loads, can be combined to narrow loads and the AND node can be removed. // Perform after legalization so that extend nodes will already be @@ -8724,6 +8731,10 @@ SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; + // Try transforming N to an indexed store. + if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + return SDValue(); } @@ -8748,6 +8759,10 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (ISD::isBuildVectorAllZeros(Mask.getNode())) return CombineTo(N, MLD->getPassThru(), MLD->getChain()); + // Try transforming N to an indexed load. 
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) + return SDValue(N, 0); + return SDValue(); } @@ -9506,11 +9521,10 @@ static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG, SDLoc dl(Ld); SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru()); - SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(), - Ld->getBasePtr(), Ld->getMask(), - PassThru, Ld->getMemoryVT(), - Ld->getMemOperand(), ExtLoadType, - Ld->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), + PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(), + ExtLoadType, Ld->isExpandingLoad()); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); return NewLoad; } @@ -13357,9 +13371,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { if (CFP1) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); - if (Level >= AfterLegalizeDAG && - (TLI.isFPImmLegal(CVal, VT, ForCodeSize) || - TLI.isOperationLegal(ISD::ConstantFP, VT))) + if (LegalDAG && (TLI.isFPImmLegal(CVal, VT, ForCodeSize) || + TLI.isOperationLegal(ISD::ConstantFP, VT))) return DAG.getNode( ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), @@ -13612,12 +13625,22 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, EVT VT; unsigned AS; - if (LoadSDNode *LD = dyn_cast(Use)) { + if (LoadSDNode *LD = dyn_cast(Use)) { + if (LD->isIndexed() || LD->getBasePtr().getNode() != N) + return false; + VT = LD->getMemoryVT(); + AS = LD->getAddressSpace(); + } else if (StoreSDNode *ST = dyn_cast(Use)) { + if (ST->isIndexed() || ST->getBasePtr().getNode() != N) + return false; + VT = ST->getMemoryVT(); + AS = ST->getAddressSpace(); + } else if (MaskedLoadSDNode *LD = dyn_cast(Use)) { if (LD->isIndexed() || LD->getBasePtr().getNode() != N) return false; VT = LD->getMemoryVT(); AS = LD->getAddressSpace(); - } else if (StoreSDNode *ST = dyn_cast(Use)) { + } else if (MaskedStoreSDNode *ST = dyn_cast(Use)) { if (ST->isIndexed() || ST->getBasePtr().getNode() != N) return false; VT = ST->getMemoryVT(); @@ -13651,38 +13674,64 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, VT.getTypeForEVT(*DAG.getContext()), AS); } -/// Try turning a load/store into a pre-indexed load/store when the base -/// pointer is an add or subtract and it has other uses besides the load/store. -/// After the transformation, the new indexed load/store has effectively folded -/// the add/subtract in and all of its other uses are redirected to the -/// new load/store. 
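The getCombineLoadStoreParts factoring that follows funnels four node kinds (load, store, masked load, masked store) through one legality query. A schematic dispatch with hypothetical enums, not the SelectionDAG types:

#include <cassert>

enum class NodeKind { Load, Store, MaskedLoad, MaskedStore };

struct Target {
  bool IndexedLoadLegal, IndexedStoreLegal;
  bool IndexedMaskedLoadLegal, IndexedMaskedStoreLegal;
};

bool canCombineIndexed(NodeKind K, const Target &T, bool &IsLoad,
                       bool &IsMasked) {
  IsLoad = (K == NodeKind::Load || K == NodeKind::MaskedLoad);
  IsMasked = (K == NodeKind::MaskedLoad || K == NodeKind::MaskedStore);
  if (IsMasked)
    return IsLoad ? T.IndexedMaskedLoadLegal : T.IndexedMaskedStoreLegal;
  return IsLoad ? T.IndexedLoadLegal : T.IndexedStoreLegal;
}

int main() {
  Target T{true, true, true, false};
  bool IsLoad, IsMasked;
  assert(canCombineIndexed(NodeKind::MaskedLoad, T, IsLoad, IsMasked));
  assert(IsLoad && IsMasked);
  assert(!canCombineIndexed(NodeKind::MaskedStore, T, IsLoad, IsMasked));
}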
-bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { - if (Level < AfterLegalizeDAG) - return false; - - bool isLoad = true; - SDValue Ptr; - EVT VT; - if (LoadSDNode *LD = dyn_cast(N)) { +static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec, + bool &IsLoad, bool &IsMasked, SDValue &Ptr, + const TargetLowering &TLI) { + if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; - VT = LD->getMemoryVT(); - if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) && - !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT)) + EVT VT = LD->getMemoryVT(); + if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT)) return false; Ptr = LD->getBasePtr(); - } else if (StoreSDNode *ST = dyn_cast(N)) { + } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; - VT = ST->getMemoryVT(); - if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) && - !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT)) + EVT VT = ST->getMemoryVT(); + if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT)) + return false; + Ptr = ST->getBasePtr(); + IsLoad = false; + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + if (LD->isIndexed()) + return false; + EVT VT = LD->getMemoryVT(); + if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) && + !TLI.isIndexedMaskedLoadLegal(Dec, VT)) + return false; + Ptr = LD->getBasePtr(); + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast(N)) { + if (ST->isIndexed()) + return false; + EVT VT = ST->getMemoryVT(); + if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) && + !TLI.isIndexedMaskedStoreLegal(Dec, VT)) return false; Ptr = ST->getBasePtr(); - isLoad = false; + IsLoad = false; + IsMasked = true; } else { return false; } + return true; +} + +/// Try turning a load/store into a pre-indexed load/store when the base +/// pointer is an add or subtract and it has other uses besides the load/store. +/// After the transformation, the new indexed load/store has effectively folded +/// the add/subtract in and all of its other uses are redirected to the +/// new load/store. +bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { + if (Level < AfterLegalizeDAG) + return false; + + bool IsLoad = true; + bool IsMasked = false; + SDValue Ptr; + if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked, + Ptr, TLI)) + return false; // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. @@ -13724,8 +13773,9 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { return false; // Check #2. - if (!isLoad) { - SDValue Val = cast(N)->getValue(); + if (!IsLoad) { + SDValue Val = IsMasked ? cast(N)->getValue() + : cast(N)->getValue(); // Would require a copy. 
    if (Val == BasePtr)
      return false;
@@ -13801,18 +13851,26 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
     return false;

   SDValue Result;
-  if (isLoad)
-    Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
-                                BasePtr, Offset, AM);
-  else
-    Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
-                                 BasePtr, Offset, AM);
+  if (!IsMasked) {
+    if (IsLoad)
+      Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
+    else
+      Result =
+          DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
+  } else {
+    if (IsLoad)
+      Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+                                        Offset, AM);
+    else
+      Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
+                                         Offset, AM);
+  }
   ++PreIndexedNodes;
   ++NodesCombined;
   LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
              Result.getNode()->dump(&DAG); dbgs() << '\n');
   WorklistRemover DeadNodes(*this);
-  if (isLoad) {
+  if (IsLoad) {
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
   } else {
@@ -13866,7 +13924,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {

   // We can now generate the new expression.
   SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
-  SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);
+  SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);

   SDValue NewUse = DAG.getNode(Opcode,
                                DL,
@@ -13876,7 +13934,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
   }

   // Replace the uses of Ptr with uses of the updated base value.
-  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
+  DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
   deleteAndRecombine(Ptr.getNode());
   AddToWorklist(Result.getNode());

@@ -13891,29 +13949,12 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
   if (Level < AfterLegalizeDAG)
     return false;

-  bool isLoad = true;
+  bool IsLoad = true;
+  bool IsMasked = false;
   SDValue Ptr;
-  EVT VT;
-  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
-    if (LD->isIndexed())
-      return false;
-    VT = LD->getMemoryVT();
-    if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
-        !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
-      return false;
-    Ptr = LD->getBasePtr();
-  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
-    if (ST->isIndexed())
-      return false;
-    VT = ST->getMemoryVT();
-    if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
-        !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
-      return false;
-    Ptr = ST->getBasePtr();
-    isLoad = false;
-  } else {
+  if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked,
+                                Ptr, TLI))
     return false;
-  }

   if (Ptr.getNode()->hasOneUse())
     return false;
@@ -13949,7 +13990,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {

       // If all the uses are load / store addresses, then don't do the
       // transformation.
-      if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
+      if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
        bool RealUse = false;
        for (SDNode *UseUse : Use->uses()) {
          if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
@@ -13975,18 +14016,24 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
        Worklist.push_back(Op);
      if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
          !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) {
-        SDValue Result = isLoad
-            ?
 DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
-                                 BasePtr, Offset, AM)
-            : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
-                                  BasePtr, Offset, AM);
+        SDValue Result;
+        if (!IsMasked)
+          Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+                                               Offset, AM)
+                          : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
+                                                BasePtr, Offset, AM);
+        else
+          Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
+                                                     BasePtr, Offset, AM)
+                          : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
+                                                      BasePtr, Offset, AM);
        ++PostIndexedNodes;
        ++NodesCombined;
        LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
                   dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
                   dbgs() << '\n');
        WorklistRemover DeadNodes(*this);
-        if (isLoad) {
+        if (IsLoad) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
          DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
        } else {
@@ -13998,7 +14045,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {

        // Replace the uses of Use with uses of the updated base value.
        DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
-                                      Result.getValue(isLoad ? 1 : 0));
+                                      Result.getValue(IsLoad ? 1 : 0));
        deleteAndRecombine(Op);
        return true;
      }
@@ -16655,11 +16702,15 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {

 /// Convert a disguised subvector insertion into a shuffle:
 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
+  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
+         "Expected insert_vector_elt");
   SDValue InsertVal = N->getOperand(1);
   SDValue Vec = N->getOperand(0);

-  // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), InsIndex)
-  // --> (vector_shuffle X, Y)
+  // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
+  // InsIndex)
+  //   --> (vector_shuffle X, Y) and variations where shuffle operands may be
+  //   CONCAT_VECTORS.
   if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
       InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       isa<ConstantSDNode>(InsertVal.getOperand(1))) {
@@ -16672,18 +16723,47 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
     // Vec's operand 0 is using indices from 0 to N-1 and
     // operand 1 from N to 2N - 1, where N is the number of
     // elements in the vectors.
-    int XOffset = -1;
-    if (InsertVal.getOperand(0) == X) {
-      XOffset = 0;
-    } else if (InsertVal.getOperand(0) == Y) {
-      XOffset = X.getValueType().getVectorNumElements();
+    SDValue InsertVal0 = InsertVal.getOperand(0);
+    int ElementOffset = -1;
+
+    // We explore the inputs of the shuffle in order to see if we find the
+    // source of the extract_vector_elt. If so, we can use it to modify the
+    // shuffle rather than perform an insert_vector_elt.
+    SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
+    ArgWorkList.emplace_back(Mask.size(), Y);
+    ArgWorkList.emplace_back(0, X);
+
+    while (!ArgWorkList.empty()) {
+      int ArgOffset;
+      SDValue ArgVal;
+      std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
+
+      if (ArgVal == InsertVal0) {
+        ElementOffset = ArgOffset;
+        break;
+      }
+
+      // Peek through concat_vectors.
+      if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
+        int CurrentArgOffset =
+            ArgOffset + ArgVal.getValueType().getVectorNumElements();
+        int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
+        for (SDValue Op : reverse(ArgVal->ops())) {
+          CurrentArgOffset -= Step;
+          ArgWorkList.emplace_back(CurrentArgOffset, Op);
+        }
+
+        // Make sure we went through all the elements and did not screw up
+        // index computation.
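+        // (Each concat operand advances the offset by Step elements, so after
+        // walking all operands in reverse we must land back on the offset we
+        // started from.)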
+        assert(CurrentArgOffset == ArgOffset);
+      }
     }

-    if (XOffset != -1) {
+    if (ElementOffset != -1) {
       SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());

       auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
-      NewMask[InsIndex] = XOffset + ExtrIndex->getZExtValue();
+      NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
       assert(NewMask[InsIndex] <
                  (int)(2 * Vec.getValueType().getVectorNumElements()) &&
              NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");
@@ -16915,8 +16995,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
   AddToWorklist(EVE);
   // Since we're explicitly calling ReplaceAllUses, add the new node to the
   // worklist explicitly as well.
-  AddUsersToWorklist(Load.getNode()); // Add users too
-  AddToWorklist(Load.getNode());
+  AddToWorklistWithUsers(Load.getNode());
   ++OpsNarrowed;
   return SDValue(EVE, 0);
 }
@@ -20436,7 +20515,7 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
 ///     Result = N X_i + X_i (N - N A X_i)
 SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
                                       SDNodeFlags Flags) {
-  if (Level >= AfterLegalizeDAG)
+  if (LegalDAG)
     return SDValue();

   // TODO: Handle half and/or extended types?
@@ -20575,7 +20654,7 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
 /// Op can be zero.
 SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
                                            bool Reciprocal) {
-  if (Level >= AfterLegalizeDAG)
+  if (LegalDAG)
     return SDValue();

   // TODO: Handle half and/or extended types?
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index f55c81b2f3d08..0d8a547a92561 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -410,8 +410,8 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
   else if (isa<ConstantPointerNull>(V))
     // Translate this as an integer zero so that it can be
     // local-CSE'd with actual integer zeros.
-    Reg = getRegForValue(
-        Constant::getNullValue(DL.getIntPtrType(V->getContext())));
+    Reg =
+        getRegForValue(Constant::getNullValue(DL.getIntPtrType(V->getType())));
   else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
     if (CF->isNullValue())
       Reg = fastMaterializeFloatZero(CF);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 17bb98bdddfb5..70cb20e48d20f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1023,8 +1023,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     // These pseudo-ops are the same as the other STRICT_ ops except
     // they are registered with setOperationAction() using the input type
     // instead of the output type.
-    Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
-                                            Node->getOperand(1).getValueType());
+    Action = TLI.getOperationAction(Node->getOpcode(),
+                                    Node->getOperand(1).getValueType());
     break;
   case ISD::SIGN_EXTEND_INREG: {
     EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
@@ -3692,7 +3692,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     }
     SDValue Result = DAG.getBuildVector(Node->getValueType(0), dl, Scalars);
-    ReplaceNode(SDValue(Node, 0), Result);
+    Results.push_back(Result);
     break;
   }
   case ISD::VECREDUCE_FADD:
@@ -3720,7 +3720,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   case ISD::INTRINSIC_WO_CHAIN:
   case ISD::INTRINSIC_VOID:
     // FIXME: Custom lowering for these operations shouldn't return null!
-    break;
+    // Return true so that we don't call ConvertNodeToLibcall which also won't
+    // do anything.
+ return true; } if (!TLI.isStrictFPEnabled() && Results.empty() && Node->isStrictFPOpcode()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index af963bc028026..a94efe74c9abe 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -65,35 +65,60 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::EXTRACT_VECTOR_ELT: R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break; case ISD::FABS: R = SoftenFloatRes_FABS(N); break; + case ISD::STRICT_FMINNUM: case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break; + case ISD::STRICT_FMAXNUM: case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; + case ISD::STRICT_FADD: case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::FCBRT: R = SoftenFloatRes_FCBRT(N); break; + case ISD::STRICT_FCEIL: case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break; + case ISD::STRICT_FCOS: case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break; + case ISD::STRICT_FDIV: case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break; + case ISD::STRICT_FEXP: case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break; + case ISD::STRICT_FEXP2: case ISD::FEXP2: R = SoftenFloatRes_FEXP2(N); break; + case ISD::STRICT_FFLOOR: case ISD::FFLOOR: R = SoftenFloatRes_FFLOOR(N); break; + case ISD::STRICT_FLOG: case ISD::FLOG: R = SoftenFloatRes_FLOG(N); break; + case ISD::STRICT_FLOG2: case ISD::FLOG2: R = SoftenFloatRes_FLOG2(N); break; + case ISD::STRICT_FLOG10: case ISD::FLOG10: R = SoftenFloatRes_FLOG10(N); break; + case ISD::STRICT_FMA: case ISD::FMA: R = SoftenFloatRes_FMA(N); break; + case ISD::STRICT_FMUL: case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break; + case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break; case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break; + case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break; case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break; case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break; + case ISD::STRICT_FPOW: case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break; + case ISD::STRICT_FPOWI: case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break; + case ISD::STRICT_FREM: case ISD::FREM: R = SoftenFloatRes_FREM(N); break; + case ISD::STRICT_FRINT: case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break; + case ISD::STRICT_FROUND: case ISD::FROUND: R = SoftenFloatRes_FROUND(N); break; + case ISD::STRICT_FSIN: case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break; + case ISD::STRICT_FSQRT: case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; + case ISD::STRICT_FSUB: case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; + case ISD::STRICT_FTRUNC: case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break; @@ -112,6 +137,46 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { } } +SDValue DAGTypeLegalizer::SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC) { + bool IsStrict = N->isStrictFPOpcode(); + EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); + unsigned Offset = IsStrict ? 1 : 0; + assert(N->getNumOperands() == (1 + Offset) && + "Unexpected number of operands!"); + SDValue Op = GetSoftenedFloat(N->getOperand(0 + Offset)); + SDValue Chain = IsStrict ? 
 N->getOperand(0) : SDValue();
+  TargetLowering::MakeLibCallOptions CallOptions;
+  EVT OpVT = N->getOperand(0 + Offset).getValueType();
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  return Tmp.first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC) {
+  bool IsStrict = N->isStrictFPOpcode();
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  unsigned Offset = IsStrict ? 1 : 0;
+  assert(N->getNumOperands() == (2 + Offset) &&
+         "Unexpected number of operands!");
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0 + Offset)),
+                     GetSoftenedFloat(N->getOperand(1 + Offset)) };
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+  TargetLowering::MakeLibCallOptions CallOptions;
+  EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
+                   N->getOperand(1 + Offset).getValueType() };
+  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  return Tmp.first;
+}
+
 SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
   return BitConvertToInteger(N->getOperand(0));
 }
@@ -175,84 +240,48 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)) };
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::FMIN_F32,
-                                           RTLIB::FMIN_F64,
-                                           RTLIB::FMIN_F80,
-                                           RTLIB::FMIN_F128,
-                                           RTLIB::FMIN_PPCF128),
-                         NVT, Ops, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                               RTLIB::FMIN_F32,
+                                               RTLIB::FMIN_F64,
+                                               RTLIB::FMIN_F80,
+                                               RTLIB::FMIN_F128,
+                                               RTLIB::FMIN_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)) };
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::FMAX_F32,
-                                           RTLIB::FMAX_F64,
-                                           RTLIB::FMAX_F80,
-                                           RTLIB::FMAX_F128,
-                                           RTLIB::FMAX_PPCF128),
-                         NVT, Ops, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                               RTLIB::FMAX_F32,
+                                               RTLIB::FMAX_F64,
+                                               RTLIB::FMAX_F80,
+                                               RTLIB::FMAX_F128,
+                                               RTLIB::FMAX_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)) };
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT,
N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::ADD_F32, - RTLIB::ADD_F64, - RTLIB::ADD_F80, - RTLIB::ADD_F128, - RTLIB::ADD_PPCF128), - NVT, Ops, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::ADD_F32, + RTLIB::ADD_F64, + RTLIB::ADD_F80, + RTLIB::ADD_F128, + RTLIB::ADD_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FCBRT(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32, RTLIB::CBRT_F64, RTLIB::CBRT_F80, RTLIB::CBRT_F128, - RTLIB::CBRT_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + RTLIB::CBRT_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::CEIL_F32, - RTLIB::CEIL_F64, - RTLIB::CEIL_F80, - RTLIB::CEIL_F128, - RTLIB::CEIL_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::CEIL_F32, + RTLIB::CEIL_F64, + RTLIB::CEIL_F80, + RTLIB::CEIL_F128, + RTLIB::CEIL_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { @@ -304,212 +333,150 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) { } SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::COS_F32, - RTLIB::COS_F64, - RTLIB::COS_F80, - RTLIB::COS_F128, - RTLIB::COS_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::COS_F32, + RTLIB::COS_F64, + RTLIB::COS_F80, + RTLIB::COS_F128, + RTLIB::COS_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), - GetSoftenedFloat(N->getOperand(1)) }; - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[2] = { N->getOperand(0).getValueType(), - N->getOperand(1).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::DIV_F32, - RTLIB::DIV_F64, - RTLIB::DIV_F80, - RTLIB::DIV_F128, - RTLIB::DIV_PPCF128), - NVT, Ops, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::DIV_F32, + RTLIB::DIV_F64, + RTLIB::DIV_F80, + RTLIB::DIV_F128, + RTLIB::DIV_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) { - EVT NVT = 
TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::EXP_F32, - RTLIB::EXP_F64, - RTLIB::EXP_F80, - RTLIB::EXP_F128, - RTLIB::EXP_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::EXP_F32, + RTLIB::EXP_F64, + RTLIB::EXP_F80, + RTLIB::EXP_F128, + RTLIB::EXP_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::EXP2_F32, - RTLIB::EXP2_F64, - RTLIB::EXP2_F80, - RTLIB::EXP2_F128, - RTLIB::EXP2_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::EXP2_F32, + RTLIB::EXP2_F64, + RTLIB::EXP2_F80, + RTLIB::EXP2_F128, + RTLIB::EXP2_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::FLOOR_F32, - RTLIB::FLOOR_F64, - RTLIB::FLOOR_F80, - RTLIB::FLOOR_F128, - RTLIB::FLOOR_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::FLOOR_F32, + RTLIB::FLOOR_F64, + RTLIB::FLOOR_F80, + RTLIB::FLOOR_F128, + RTLIB::FLOOR_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::LOG_F32, - RTLIB::LOG_F64, - RTLIB::LOG_F80, - RTLIB::LOG_F128, - RTLIB::LOG_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::LOG_F32, + RTLIB::LOG_F64, + RTLIB::LOG_F80, + RTLIB::LOG_F128, + RTLIB::LOG_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::LOG2_F32, - RTLIB::LOG2_F64, - RTLIB::LOG2_F80, - RTLIB::LOG2_F128, - RTLIB::LOG2_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::LOG2_F32, + RTLIB::LOG2_F64, + 
 RTLIB::LOG2_F80,
+                                              RTLIB::LOG2_F128,
+                                              RTLIB::LOG2_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::LOG10_F32,
-                                           RTLIB::LOG10_F64,
-                                           RTLIB::LOG10_F80,
-                                           RTLIB::LOG10_F128,
-                                           RTLIB::LOG10_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+                                              RTLIB::LOG10_F32,
+                                              RTLIB::LOG10_F64,
+                                              RTLIB::LOG10_F80,
+                                              RTLIB::LOG10_F128,
+                                              RTLIB::LOG10_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
+  bool IsStrict = N->isStrictFPOpcode();
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)),
-                     GetSoftenedFloat(N->getOperand(2)) };
+  unsigned Offset = IsStrict ? 1 : 0;
+  SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0 + Offset)),
+                     GetSoftenedFloat(N->getOperand(1 + Offset)),
+                     GetSoftenedFloat(N->getOperand(2 + Offset)) };
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[3] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType(),
-                   N->getOperand(2).getValueType() };
+  EVT OpsVT[3] = { N->getOperand(0 + Offset).getValueType(),
+                   N->getOperand(1 + Offset).getValueType(),
+                   N->getOperand(2 + Offset).getValueType() };
   CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::FMA_F32,
-                                           RTLIB::FMA_F64,
-                                           RTLIB::FMA_F80,
-                                           RTLIB::FMA_F128,
-                                           RTLIB::FMA_PPCF128),
-                         NVT, Ops, CallOptions, SDLoc(N)).first;
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG,
+                                                    GetFPLibCall(N->getValueType(0),
+                                                                 RTLIB::FMA_F32,
+                                                                 RTLIB::FMA_F64,
+                                                                 RTLIB::FMA_F80,
+                                                                 RTLIB::FMA_F128,
+                                                                 RTLIB::FMA_PPCF128),
+                                                    NVT, Ops, CallOptions,
+                                                    SDLoc(N), Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  return Tmp.first;
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)) };
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::MUL_F32,
-                                           RTLIB::MUL_F64,
-                                           RTLIB::MUL_F80,
-                                           RTLIB::MUL_F128,
-                                           RTLIB::MUL_PPCF128),
-                         NVT, Ops, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                               RTLIB::MUL_F32,
+                                               RTLIB::MUL_F64,
+                                               RTLIB::MUL_F80,
+                                               RTLIB::MUL_F128,
+                                               RTLIB::MUL_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::NEARBYINT_F32,
-
RTLIB::NEARBYINT_F64, - RTLIB::NEARBYINT_F80, - RTLIB::NEARBYINT_F128, - RTLIB::NEARBYINT_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::NEARBYINT_F32, + RTLIB::NEARBYINT_F64, + RTLIB::NEARBYINT_F80, + RTLIB::NEARBYINT_F128, + RTLIB::NEARBYINT_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); - EVT FloatVT = N->getValueType(0); - if (FloatVT == MVT::f32 || FloatVT == MVT::f64 || FloatVT == MVT::f128) { - // Expand Y = FNEG(X) -> Y = X ^ sign mask - APInt SignMask = APInt::getSignMask(NVT.getSizeInBits()); - return DAG.getNode(ISD::XOR, dl, NVT, GetSoftenedFloat(N->getOperand(0)), - DAG.getConstant(SignMask, dl, NVT)); - } - - // Expand Y = FNEG(X) -> Y = SUB -0.0, X - SDValue Ops[2] = { DAG.getConstantFP(-0.0, dl, N->getValueType(0)), - GetSoftenedFloat(N->getOperand(0)) }; - TargetLowering::MakeLibCallOptions CallOptions; - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::SUB_F32, - RTLIB::SUB_F64, - RTLIB::SUB_F80, - RTLIB::SUB_F128, - RTLIB::SUB_PPCF128), - NVT, Ops, CallOptions, dl).first; + // Expand Y = FNEG(X) -> Y = X ^ sign mask + APInt SignMask = APInt::getSignMask(NVT.getSizeInBits()); + return DAG.getNode(ISD::XOR, dl, NVT, GetSoftenedFloat(N->getOperand(0)), + DAG.getConstant(SignMask, dl, NVT)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { + bool IsStrict = N->isStrictFPOpcode(); EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = N->getOperand(0); + SDValue Op = N->getOperand(IsStrict ? 1 : 0); + + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's // entirely possible for both f16 and f32 to be legal, so use the fully // hard-float FP_EXTEND rather than FP16_TO_FP. if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) { - Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); + if (IsStrict) { + Op = DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(N), + { MVT::f32, MVT::Other }, { Chain, Op }); + Chain = Op.getValue(1); + } else { + Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op); + } + if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat) AddToWorklist(Op.getNode()); } @@ -526,9 +493,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) { RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0)); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!"); TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N)).first; + EVT OpVT = N->getOperand(IsStrict ? 
 1 : 0).getValueType();
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  return Tmp.first;
 }

 // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
@@ -551,41 +523,36 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
+  bool IsStrict = N->isStrictFPOpcode();
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Op = N->getOperand(0);
-  if (N->getValueType(0) == MVT::f16) {
-    // Semi-soften first, to FP_TO_FP16, so that targets which support f16 as a
-    // storage-only type get a chance to select things.
-    return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, Op);
-  }
-
+  SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
   TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N)).first;
+  EVT OpVT = N->getOperand(IsStrict ? 1 : 0).getValueType();
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  return Tmp.first;
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)) };
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::POW_F32,
-                                           RTLIB::POW_F64,
-                                           RTLIB::POW_F80,
-                                           RTLIB::POW_F128,
-                                           RTLIB::POW_PPCF128),
-                         NVT, Ops, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                               RTLIB::POW_F32,
+                                               RTLIB::POW_F64,
+                                               RTLIB::POW_F80,
+                                               RTLIB::POW_F128,
+                                               RTLIB::POW_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
-  assert(N->getOperand(1).getValueType() == MVT::i32 &&
+  bool IsStrict = N->isStrictFPOpcode();
+  unsigned Offset = IsStrict ? 1 : 0;
+  assert(N->getOperand(1 + Offset).getValueType() == MVT::i32 &&
          "Unsupported power type!");
   RTLIB::Libcall LC = GetFPLibCall(N->getValueType(0),
                                    RTLIB::POWI_F32,
@@ -601,124 +568,82 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
   }

   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) };
+  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0 + Offset)),
+                     N->getOperand(1 + Offset) };
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
+  EVT OpsVT[2] = { N->getOperand(0 + Offset).getValueType(),
+                   N->getOperand(1 + Offset).getValueType() };
   CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, LC, NVT, Ops, CallOptions, SDLoc(N)).first;
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Ops,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  return Tmp.first;
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
-                     GetSoftenedFloat(N->getOperand(1)) };
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[2] = { N->getOperand(0).getValueType(),
-                   N->getOperand(1).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::REM_F32,
-                                           RTLIB::REM_F64,
-                                           RTLIB::REM_F80,
-                                           RTLIB::REM_F128,
-                                           RTLIB::REM_PPCF128),
-                         NVT, Ops, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                               RTLIB::REM_F32,
+                                               RTLIB::REM_F64,
+                                               RTLIB::REM_F80,
+                                               RTLIB::REM_F128,
+                                               RTLIB::REM_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::RINT_F32,
-                                           RTLIB::RINT_F64,
-                                           RTLIB::RINT_F80,
-                                           RTLIB::RINT_F128,
-                                           RTLIB::RINT_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+                                              RTLIB::RINT_F32,
+                                              RTLIB::RINT_F64,
+                                              RTLIB::RINT_F80,
+                                              RTLIB::RINT_F128,
+                                              RTLIB::RINT_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::ROUND_F32,
-                                           RTLIB::ROUND_F64,
-                                           RTLIB::ROUND_F80,
-                                           RTLIB::ROUND_F128,
-                                           RTLIB::ROUND_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+                                              RTLIB::ROUND_F32,
+                                              RTLIB::ROUND_F64,
+                                              RTLIB::ROUND_F80,
+                                              RTLIB::ROUND_F128,
+                                              RTLIB::ROUND_PPCF128));
 }

 SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
-                                           RTLIB::SIN_F32,
-                                           RTLIB::SIN_F64,
-                                           RTLIB::SIN_F80,
-                                           RTLIB::SIN_F128,
-                                           RTLIB::SIN_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+  return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+                                              RTLIB::SIN_F32,
+
RTLIB::SIN_F64, + RTLIB::SIN_F80, + RTLIB::SIN_F128, + RTLIB::SIN_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::SQRT_F32, - RTLIB::SQRT_F64, - RTLIB::SQRT_F80, - RTLIB::SQRT_F128, - RTLIB::SQRT_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::SQRT_F32, + RTLIB::SQRT_F64, + RTLIB::SQRT_F80, + RTLIB::SQRT_F128, + RTLIB::SQRT_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), - GetSoftenedFloat(N->getOperand(1)) }; - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[2] = { N->getOperand(0).getValueType(), - N->getOperand(1).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::SUB_F32, - RTLIB::SUB_F64, - RTLIB::SUB_F80, - RTLIB::SUB_F128, - RTLIB::SUB_PPCF128), - NVT, Ops, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::SUB_F32, + RTLIB::SUB_F64, + RTLIB::SUB_F80, + RTLIB::SUB_F128, + RTLIB::SUB_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { - EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); - if (N->getValueType(0) == MVT::f16) - return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0)); - - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::TRUNC_F32, - RTLIB::TRUNC_F64, - RTLIB::TRUNC_F80, - RTLIB::TRUNC_F128, - RTLIB::TRUNC_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; + return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::TRUNC_F32, + RTLIB::TRUNC_F64, + RTLIB::TRUNC_F80, + RTLIB::TRUNC_F128, + RTLIB::TRUNC_PPCF128)); } SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) { @@ -845,18 +770,25 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) { case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break; case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break; - case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break; case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break; + case ISD::STRICT_FP_TO_SINT: + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break; + case ISD::STRICT_LROUND: case ISD::LROUND: Res = SoftenFloatOp_LROUND(N); break; + case ISD::STRICT_LLROUND: case ISD::LLROUND: Res = SoftenFloatOp_LLROUND(N); break; + case ISD::STRICT_LRINT: case ISD::LRINT: Res = SoftenFloatOp_LRINT(N); break; + case ISD::STRICT_LLRINT: case ISD::LLRINT: Res = SoftenFloatOp_LLRINT(N); break; case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break; case ISD::SETCC: Res = 
 SoftenFloatOp_SETCC(N); break;
   case ISD::STORE:     Res = SoftenFloatOp_STORE(N, OpNo); break;
+  case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break;
   }

   // If the result is null, the sub-method took care of registering results etc.
@@ -868,7 +800,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
     return true;

   assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
-         "Invalid operand promotion");
+         "Invalid operand softening");

   ReplaceValueWith(SDValue(N, 0), Res);
   return false;
@@ -880,42 +812,34 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
   return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
 }

-SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
-  // If we get here, the result must be legal but the source illegal.
-  EVT SVT = N->getOperand(0).getValueType();
-  EVT RVT = N->getValueType(0);
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-
-  if (SVT == MVT::f16)
-    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), RVT, Op);
-
-  RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT);
-  assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall");
-
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N)).first;
-}
-
-
 SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
   // We actually deal with the partially-softened FP_TO_FP16 node too, which
   // returns an i16 so doesn't meet the constraints necessary for FP_ROUND.
-  assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16);
+  assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16 ||
+         N->getOpcode() == ISD::STRICT_FP_ROUND);

-  EVT SVT = N->getOperand(0).getValueType();
+  bool IsStrict = N->isStrictFPOpcode();
+  SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+  EVT SVT = Op.getValueType();
   EVT RVT = N->getValueType(0);
   EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT;

   RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");

-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+  Op = GetSoftenedFloat(Op);
   TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N)).first;
+  CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, RVT, Op,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict) {
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+    ReplaceValueWith(SDValue(N, 0), Tmp.first);
+    return SDValue();
+  }
+  return Tmp.first;
 }

 SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
@@ -943,8 +867,12 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
 }

 SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
-  bool Signed = N->getOpcode() == ISD::FP_TO_SINT;
-  EVT SVT = N->getOperand(0).getValueType();
+  bool IsStrict = N->isStrictFPOpcode();
+  bool Signed = N->getOpcode() == ISD::FP_TO_SINT ||
+                N->getOpcode() == ISD::STRICT_FP_TO_SINT;
+
+  SDValue Op = N->getOperand(IsStrict ? 1 : 0);
+  EVT SVT = Op.getValueType();
   EVT RVT = N->getValueType(0);
   EVT NVT = EVT();
   SDLoc dl(N);
@@ -960,18 +888,26 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
     NVT = (MVT::SimpleValueType)IntVT;
     // The type needs to be big enough to hold the result.
     if (NVT.bitsGE(RVT))
-      LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT):RTLIB::getFPTOUINT(SVT, NVT);
+      LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT) : RTLIB::getFPTOUINT(SVT, NVT);
   }
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!");

-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  Op = GetSoftenedFloat(Op);
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl).first;
+  CallOptions.setTypeListBeforeSoften(SVT, RVT, true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+                                                    CallOptions, dl, Chain);

   // Truncate the result if the libcall returns a larger type.
-  return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res);
+  SDValue Res = DAG.getNode(ISD::TRUNCATE, dl, RVT, Tmp.first);
+
+  if (!IsStrict)
+    return Res;
+
+  ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  ReplaceValueWith(SDValue(N, 0), Res);
+  return SDValue();
 }

 SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
@@ -1039,72 +975,99 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
                            ST->getMemOperand());
 }

-SDValue DAGTypeLegalizer::SoftenFloatOp_LROUND(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) {
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = BitConvertToInteger(N->getOperand(1));
+  SDLoc dl(N);

-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  EVT RetVT = N->getOperand(0).getValueType();
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
-                                           RTLIB::LROUND_F32,
-                                           RTLIB::LROUND_F64,
-                                           RTLIB::LROUND_F80,
-                                           RTLIB::LROUND_F128,
-                                           RTLIB::LROUND_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+  EVT LVT = LHS.getValueType();
+  EVT ILVT = EVT::getIntegerVT(*DAG.getContext(), LVT.getSizeInBits());
+  EVT RVT = RHS.getValueType();
+
+  unsigned LSize = LVT.getSizeInBits();
+  unsigned RSize = RVT.getSizeInBits();
+
+  // Shift right or sign-extend it if the two operands have different types.
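+  // E.g. softening copysign with an f32 LHS and an f64 RHS: the RHS sign sits
+  // in bit 63 of its i64 bit pattern, so shifting right by 32 and truncating
+  // to i32 moves it to bit 31, where the f32 sign bit lives.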
+  int SizeDiff = RSize - LSize;
+  if (SizeDiff > 0) {
+    RHS =
+        DAG.getNode(ISD::SRL, dl, RVT, RHS,
+                    DAG.getConstant(SizeDiff, dl,
+                                    TLI.getShiftAmountTy(RHS.getValueType(),
+                                                         DAG.getDataLayout())));
+    RHS = DAG.getNode(ISD::TRUNCATE, dl, ILVT, RHS);
+  } else if (SizeDiff < 0) {
+    RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, RHS);
+    RHS =
+        DAG.getNode(ISD::SHL, dl, ILVT, RHS,
+                    DAG.getConstant(-SizeDiff, dl,
+                                    TLI.getShiftAmountTy(RHS.getValueType(),
+                                                         DAG.getDataLayout())));
+  }
+
+  RHS = DAG.getBitcast(LVT, RHS);
+  return DAG.getNode(ISD::FCOPYSIGN, dl, LVT, LHS, RHS);
 }

-SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) {
+SDValue DAGTypeLegalizer::SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-
-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  EVT RetVT = N->getOperand(0).getValueType();
+  bool IsStrict = N->isStrictFPOpcode();
+  unsigned Offset = IsStrict ? 1 : 0;
+  SDValue Op = GetSoftenedFloat(N->getOperand(0 + Offset));
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
   TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
-                                           RTLIB::LLROUND_F32,
-                                           RTLIB::LLROUND_F64,
-                                           RTLIB::LLROUND_F80,
-                                           RTLIB::LLROUND_F128,
-                                           RTLIB::LLROUND_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+  EVT OpVT = N->getOperand(0 + Offset).getValueType();
+  CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, NVT, Op,
+                                                    CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict) {
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+    ReplaceValueWith(SDValue(N, 0), Tmp.first);
+    return SDValue();
+  }
+
+  return Tmp.first;
 }

-SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+SDValue DAGTypeLegalizer::SoftenFloatOp_LROUND(SDNode *N) {
+  EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType();
+  return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT,
+                                             RTLIB::LROUND_F32,
+                                             RTLIB::LROUND_F64,
+                                             RTLIB::LROUND_F80,
+                                             RTLIB::LROUND_F128,
+                                             RTLIB::LROUND_PPCF128));
+}

-  SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  EVT RetVT = N->getOperand(0).getValueType();
-  TargetLowering::MakeLibCallOptions CallOptions;
-  EVT OpsVT[1] = { N->getOperand(0).getValueType() };
-  CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
-  return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
-                                           RTLIB::LRINT_F32,
-                                           RTLIB::LRINT_F64,
-                                           RTLIB::LRINT_F80,
-                                           RTLIB::LRINT_F128,
-                                           RTLIB::LRINT_PPCF128),
-                         NVT, Op, CallOptions, SDLoc(N)).first;
+SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) {
+  EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType();
+  return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT,
+                                             RTLIB::LLROUND_F32,
+                                             RTLIB::LLROUND_F64,
+                                             RTLIB::LLROUND_F80,
+                                             RTLIB::LLROUND_F128,
+                                             RTLIB::LLROUND_PPCF128));
 }

-SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) {
-  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) {
+  EVT OpVT = N->getOperand(N->isStrictFPOpcode() ?
1 : 0).getValueType(); + return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT, + RTLIB::LRINT_F32, + RTLIB::LRINT_F64, + RTLIB::LRINT_F80, + RTLIB::LRINT_F128, + RTLIB::LRINT_PPCF128)); +} - SDValue Op = GetSoftenedFloat(N->getOperand(0)); - EVT RetVT = N->getOperand(0).getValueType(); - TargetLowering::MakeLibCallOptions CallOptions; - EVT OpsVT[1] = { N->getOperand(0).getValueType() }; - CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true); - return TLI.makeLibCall(DAG, GetFPLibCall(RetVT, - RTLIB::LLRINT_F32, - RTLIB::LLRINT_F64, - RTLIB::LLRINT_F80, - RTLIB::LLRINT_F128, - RTLIB::LLRINT_PPCF128), - NVT, Op, CallOptions, SDLoc(N)).first; +SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) { + EVT OpVT = N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType(); + return SoftenFloatOp_Unary(N, GetFPLibCall(OpVT, + RTLIB::LLRINT_F32, + RTLIB::LLRINT_F64, + RTLIB::LLRINT_F80, + RTLIB::LLRINT_F128, + RTLIB::LLRINT_PPCF128)); } //===----------------------------------------------------------------------===// @@ -1145,36 +1108,61 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break; case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break; + case ISD::STRICT_FMINNUM: case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break; + case ISD::STRICT_FMAXNUM: case ISD::FMAXNUM: ExpandFloatRes_FMAXNUM(N, Lo, Hi); break; + case ISD::STRICT_FADD: case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break; case ISD::FCBRT: ExpandFloatRes_FCBRT(N, Lo, Hi); break; + case ISD::STRICT_FCEIL: case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break; case ISD::FCOPYSIGN: ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break; + case ISD::STRICT_FCOS: case ISD::FCOS: ExpandFloatRes_FCOS(N, Lo, Hi); break; + case ISD::STRICT_FDIV: case ISD::FDIV: ExpandFloatRes_FDIV(N, Lo, Hi); break; + case ISD::STRICT_FEXP: case ISD::FEXP: ExpandFloatRes_FEXP(N, Lo, Hi); break; + case ISD::STRICT_FEXP2: case ISD::FEXP2: ExpandFloatRes_FEXP2(N, Lo, Hi); break; + case ISD::STRICT_FFLOOR: case ISD::FFLOOR: ExpandFloatRes_FFLOOR(N, Lo, Hi); break; + case ISD::STRICT_FLOG: case ISD::FLOG: ExpandFloatRes_FLOG(N, Lo, Hi); break; + case ISD::STRICT_FLOG2: case ISD::FLOG2: ExpandFloatRes_FLOG2(N, Lo, Hi); break; + case ISD::STRICT_FLOG10: case ISD::FLOG10: ExpandFloatRes_FLOG10(N, Lo, Hi); break; + case ISD::STRICT_FMA: case ISD::FMA: ExpandFloatRes_FMA(N, Lo, Hi); break; + case ISD::STRICT_FMUL: case ISD::FMUL: ExpandFloatRes_FMUL(N, Lo, Hi); break; + case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break; case ISD::FNEG: ExpandFloatRes_FNEG(N, Lo, Hi); break; + case ISD::STRICT_FP_EXTEND: case ISD::FP_EXTEND: ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break; + case ISD::STRICT_FPOW: case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break; + case ISD::STRICT_FPOWI: case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break; + case ISD::STRICT_FRINT: case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break; + case ISD::STRICT_FROUND: case ISD::FROUND: ExpandFloatRes_FROUND(N, Lo, Hi); break; + case ISD::STRICT_FSIN: case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break; + case ISD::STRICT_FSQRT: case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break; + case ISD::STRICT_FSUB: case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break; + case ISD::STRICT_FTRUNC: case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break; case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break; case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: 
 ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
+  case ISD::STRICT_FREM:
   case ISD::FREM:       ExpandFloatRes_FREM(N, Lo, Hi); break;
   }

@@ -1198,6 +1186,36 @@ void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
                                  dl, NVT);
 }

+void DAGTypeLegalizer::ExpandFloatRes_Unary(SDNode *N, RTLIB::Libcall LC,
+                                            SDValue &Lo, SDValue &Hi) {
+  bool IsStrict = N->isStrictFPOpcode();
+  unsigned Offset = IsStrict ? 1 : 0;
+  SDValue Op = N->getOperand(0 + Offset);
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+  TargetLowering::MakeLibCallOptions CallOptions;
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, N->getValueType(0),
+                                                    Op, CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  GetPairElements(Tmp.first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC,
+                                             SDValue &Lo, SDValue &Hi) {
+  bool IsStrict = N->isStrictFPOpcode();
+  unsigned Offset = IsStrict ? 1 : 0;
+  SDValue Ops[] = { N->getOperand(0 + Offset), N->getOperand(1 + Offset) };
+  SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+  TargetLowering::MakeLibCallOptions CallOptions;
+  std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(DAG, LC, N->getValueType(0),
+                                                    Ops, CallOptions, SDLoc(N),
+                                                    Chain);
+  if (IsStrict)
+    ReplaceValueWith(SDValue(N, 1), Tmp.second);
+  GetPairElements(Tmp.first, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   assert(N->getValueType(0) == MVT::ppcf128 &&
@@ -1214,190 +1232,159 @@ void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,

 void DAGTypeLegalizer::ExpandFloatRes_FMINNUM(SDNode *N, SDValue &Lo,
                                               SDValue &Hi) {
-  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
-                                         RTLIB::FMIN_F32, RTLIB::FMIN_F64,
-                                         RTLIB::FMIN_F80, RTLIB::FMIN_F128,
-                                         RTLIB::FMIN_PPCF128),
-                            N, false);
-  GetPairElements(Call, Lo, Hi);
+  ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                        RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+                                        RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+                                        RTLIB::FMIN_PPCF128), Lo, Hi);
 }

 void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo,
                                               SDValue &Hi) {
-  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
-                                         RTLIB::FMAX_F32, RTLIB::FMAX_F64,
-                                         RTLIB::FMAX_F80, RTLIB::FMAX_F128,
-                                         RTLIB::FMAX_PPCF128),
-                            N, false);
-  GetPairElements(Call, Lo, Hi);
+  ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                        RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+                                        RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+                                        RTLIB::FMAX_PPCF128), Lo, Hi);
 }

 void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
-  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
-                                         RTLIB::ADD_F32, RTLIB::ADD_F64,
-                                         RTLIB::ADD_F80, RTLIB::ADD_F128,
-                                         RTLIB::ADD_PPCF128),
-                            N, false);
-  GetPairElements(Call, Lo, Hi);
+  ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
+                                        RTLIB::ADD_F32, RTLIB::ADD_F64,
+                                        RTLIB::ADD_F80, RTLIB::ADD_F128,
+                                        RTLIB::ADD_PPCF128), Lo, Hi);
 }

 void DAGTypeLegalizer::ExpandFloatRes_FCBRT(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
-  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32,
-                                         RTLIB::CBRT_F64, RTLIB::CBRT_F80,
-                                         RTLIB::CBRT_F128, RTLIB::CBRT_PPCF128),
-                            N, false);
-  GetPairElements(Call, Lo, Hi);
+  ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32,
+                                       RTLIB::CBRT_F64, RTLIB::CBRT_F80,
+                                       RTLIB::CBRT_F128,
+                                       RTLIB::CBRT_PPCF128), Lo, Hi);
 }

 void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
-  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
-                                         RTLIB::CEIL_F32,
RTLIB::CEIL_F64, - RTLIB::CEIL_F80, RTLIB::CEIL_F128, - RTLIB::CEIL_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::CEIL_F32, RTLIB::CEIL_F64, + RTLIB::CEIL_F80, RTLIB::CEIL_F128, + RTLIB::CEIL_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::COPYSIGN_F32, - RTLIB::COPYSIGN_F64, - RTLIB::COPYSIGN_F80, - RTLIB::COPYSIGN_F128, - RTLIB::COPYSIGN_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::COPYSIGN_F32, + RTLIB::COPYSIGN_F64, + RTLIB::COPYSIGN_F80, + RTLIB::COPYSIGN_F128, + RTLIB::COPYSIGN_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::COS_F32, RTLIB::COS_F64, - RTLIB::COS_F80, RTLIB::COS_F128, - RTLIB::COS_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::COS_F32, RTLIB::COS_F64, + RTLIB::COS_F80, RTLIB::COS_F128, + RTLIB::COS_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - TargetLowering::MakeLibCallOptions CallOptions; - SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::DIV_F32, - RTLIB::DIV_F64, - RTLIB::DIV_F80, - RTLIB::DIV_F128, - RTLIB::DIV_PPCF128), - N->getValueType(0), Ops, CallOptions, - SDLoc(N)).first; - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::DIV_F32, + RTLIB::DIV_F64, + RTLIB::DIV_F80, + RTLIB::DIV_F128, + RTLIB::DIV_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::EXP_F32, RTLIB::EXP_F64, - RTLIB::EXP_F80, RTLIB::EXP_F128, - RTLIB::EXP_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::EXP_F32, RTLIB::EXP_F64, + RTLIB::EXP_F80, RTLIB::EXP_F128, + RTLIB::EXP_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::EXP2_F32, RTLIB::EXP2_F64, - RTLIB::EXP2_F80, RTLIB::EXP2_F128, - RTLIB::EXP2_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::EXP2_F32, RTLIB::EXP2_F64, + RTLIB::EXP2_F80, RTLIB::EXP2_F128, + RTLIB::EXP2_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, - RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, - RTLIB::FLOOR_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::FLOOR_F32, RTLIB::FLOOR_F64, + RTLIB::FLOOR_F80, RTLIB::FLOOR_F128, + RTLIB::FLOOR_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::LOG_F32, RTLIB::LOG_F64, - RTLIB::LOG_F80, RTLIB::LOG_F128, - RTLIB::LOG_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, 
GetFPLibCall(N->getValueType(0), + RTLIB::LOG_F32, RTLIB::LOG_F64, + RTLIB::LOG_F80, RTLIB::LOG_F128, + RTLIB::LOG_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::LOG2_F32, RTLIB::LOG2_F64, - RTLIB::LOG2_F80, RTLIB::LOG2_F128, - RTLIB::LOG2_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::LOG2_F32, RTLIB::LOG2_F64, + RTLIB::LOG2_F80, RTLIB::LOG2_F128, + RTLIB::LOG2_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::LOG10_F32, RTLIB::LOG10_F64, - RTLIB::LOG10_F80, RTLIB::LOG10_F128, - RTLIB::LOG10_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::LOG10_F32, RTLIB::LOG10_F64, + RTLIB::LOG10_F80, RTLIB::LOG10_F128, + RTLIB::LOG10_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) }; + bool IsStrict = N->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SDValue Ops[3] = { N->getOperand(0 + Offset), N->getOperand(1 + Offset), + N->getOperand(2 + Offset) }; + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); TargetLowering::MakeLibCallOptions CallOptions; - SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + std::pair Tmp = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), RTLIB::FMA_F32, RTLIB::FMA_F64, RTLIB::FMA_F80, RTLIB::FMA_F128, RTLIB::FMA_PPCF128), N->getValueType(0), Ops, CallOptions, - SDLoc(N)).first; - GetPairElements(Call, Lo, Hi); + SDLoc(N), Chain); + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Tmp.second); + GetPairElements(Tmp.first, Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - TargetLowering::MakeLibCallOptions CallOptions; - SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), RTLIB::MUL_F32, RTLIB::MUL_F64, RTLIB::MUL_F80, RTLIB::MUL_F128, - RTLIB::MUL_PPCF128), - N->getValueType(0), Ops, CallOptions, - SDLoc(N)).first; - GetPairElements(Call, Lo, Hi); + RTLIB::MUL_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::NEARBYINT_F32, - RTLIB::NEARBYINT_F64, - RTLIB::NEARBYINT_F80, - RTLIB::NEARBYINT_F128, - RTLIB::NEARBYINT_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::NEARBYINT_F32, + RTLIB::NEARBYINT_F64, + RTLIB::NEARBYINT_F80, + RTLIB::NEARBYINT_F128, + RTLIB::NEARBYINT_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo, @@ -1412,106 +1399,105 @@ void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)); SDLoc dl(N); - Hi = DAG.getNode(ISD::FP_EXTEND, dl, NVT, N->getOperand(0)); + bool IsStrict = N->isStrictFPOpcode(); + + SDValue Chain; + if (IsStrict) { + // If the expanded type is the same as the input type, just bypass the node. 
+ if (NVT == N->getOperand(1).getValueType()) { + Hi = N->getOperand(1); + Chain = N->getOperand(0); + } else { + // Otherwise we need to extend. + Hi = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, { NVT, MVT::Other }, + { N->getOperand(0), N->getOperand(1) }); + Chain = Hi.getValue(1); + } + } else { + Hi = DAG.getNode(ISD::FP_EXTEND, dl, NVT, N->getOperand(0)); + } + Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT), APInt(NVT.getSizeInBits(), 0)), dl, NVT); + + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Chain); } void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::POW_F32, RTLIB::POW_F64, - RTLIB::POW_F80, RTLIB::POW_F128, - RTLIB::POW_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::POW_F32, RTLIB::POW_F64, + RTLIB::POW_F80, RTLIB::POW_F128, + RTLIB::POW_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::POWI_F32, RTLIB::POWI_F64, - RTLIB::POWI_F80, RTLIB::POWI_F128, - RTLIB::POWI_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::POWI_F32, RTLIB::POWI_F64, + RTLIB::POWI_F80, RTLIB::POWI_F128, + RTLIB::POWI_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FREM(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::REM_F32, RTLIB::REM_F64, - RTLIB::REM_F80, RTLIB::REM_F128, - RTLIB::REM_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::REM_F32, RTLIB::REM_F64, + RTLIB::REM_F80, RTLIB::REM_F128, + RTLIB::REM_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::RINT_F32, RTLIB::RINT_F64, - RTLIB::RINT_F80, RTLIB::RINT_F128, - RTLIB::RINT_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::RINT_F32, RTLIB::RINT_F64, + RTLIB::RINT_F80, RTLIB::RINT_F128, + RTLIB::RINT_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::ROUND_F32, - RTLIB::ROUND_F64, - RTLIB::ROUND_F80, - RTLIB::ROUND_F128, - RTLIB::ROUND_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::ROUND_F32, + RTLIB::ROUND_F64, + RTLIB::ROUND_F80, + RTLIB::ROUND_F128, + RTLIB::ROUND_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::SIN_F32, RTLIB::SIN_F64, - RTLIB::SIN_F80, RTLIB::SIN_F128, - RTLIB::SIN_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::SIN_F32, RTLIB::SIN_F64, + RTLIB::SIN_F80, RTLIB::SIN_F128, + RTLIB::SIN_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::SQRT_F32, RTLIB::SQRT_F64, - RTLIB::SQRT_F80, RTLIB::SQRT_F128, - RTLIB::SQRT_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N,
GetFPLibCall(N->getValueType(0), + RTLIB::SQRT_F32, RTLIB::SQRT_F64, + RTLIB::SQRT_F80, RTLIB::SQRT_F128, + RTLIB::SQRT_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - TargetLowering::MakeLibCallOptions CallOptions; - SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0), - RTLIB::SUB_F32, - RTLIB::SUB_F64, - RTLIB::SUB_F80, - RTLIB::SUB_F128, - RTLIB::SUB_PPCF128), - N->getValueType(0), Ops, CallOptions, - SDLoc(N)).first; - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), + RTLIB::SUB_F32, + RTLIB::SUB_F64, + RTLIB::SUB_F80, + RTLIB::SUB_F128, + RTLIB::SUB_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N, SDValue &Lo, SDValue &Hi) { - SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0), - RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, - RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, - RTLIB::TRUNC_PPCF128), - N, false); - GetPairElements(Call, Lo, Hi); + ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), + RTLIB::TRUNC_F32, RTLIB::TRUNC_F64, + RTLIB::TRUNC_F80, RTLIB::TRUNC_F128, + RTLIB::TRUNC_PPCF128), Lo, Hi); } void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo, @@ -1652,8 +1638,11 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) { case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break; case ISD::FCOPYSIGN: Res = ExpandFloatOp_FCOPYSIGN(N); break; + case ISD::STRICT_FP_ROUND: case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break; + case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break; + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break; case ISD::LROUND: Res = ExpandFloatOp_LROUND(N); break; case ISD::LLROUND: Res = ExpandFloatOp_LLROUND(N); break; @@ -1742,34 +1731,72 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) { } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) { - assert(N->getOperand(0).getValueType() == MVT::ppcf128 && + bool IsStrict = N->isStrictFPOpcode(); + assert(N->getOperand(IsStrict ? 1 : 0).getValueType() == MVT::ppcf128 && "Logic only correct for ppcf128!"); SDValue Lo, Hi; - GetExpandedFloat(N->getOperand(0), Lo, Hi); - // Round it the rest of the way (e.g. to f32) if needed. - return DAG.getNode(ISD::FP_ROUND, SDLoc(N), - N->getValueType(0), Hi, N->getOperand(1)); + GetExpandedFloat(N->getOperand(IsStrict ? 1 : 0), Lo, Hi); + + if (!IsStrict) + // Round it the rest of the way (e.g. to f32) if needed. + return DAG.getNode(ISD::FP_ROUND, SDLoc(N), + N->getValueType(0), Hi, N->getOperand(1)); + + // Eliminate the node if the input float type is the same as the output float + // type. + if (Hi.getValueType() == N->getValueType(0)) { + // Connect the output chain to the input chain, unlinking the node. 
+ ReplaceValueWith(SDValue(N, 1), N->getOperand(0)); + ReplaceValueWith(SDValue(N, 0), Hi); + return SDValue(); + } + + SDValue Expansion = DAG.getNode(ISD::STRICT_FP_ROUND, SDLoc(N), + {N->getValueType(0), MVT::Other}, + {N->getOperand(0), Hi, N->getOperand(2)}); + ReplaceValueWith(SDValue(N, 1), Expansion.getValue(1)); + ReplaceValueWith(SDValue(N, 0), Expansion); + return SDValue(); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) { EVT RVT = N->getValueType(0); SDLoc dl(N); - RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Op = N->getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); + RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!"); TargetLowering::MakeLibCallOptions CallOptions; - return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), CallOptions, dl).first; + std::pair Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, + CallOptions, dl, Chain); + if (!IsStrict) + return Tmp.first; + + ReplaceValueWith(SDValue(N, 1), Tmp.second); + ReplaceValueWith(SDValue(N, 0), Tmp.first); + return SDValue(); } SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) { EVT RVT = N->getValueType(0); SDLoc dl(N); - RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Op = N->getOperand(IsStrict ? 1 : 0); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); + RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), RVT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!"); TargetLowering::MakeLibCallOptions CallOptions; - return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0), - CallOptions, dl).first; + std::pair Tmp = TLI.makeLibCall(DAG, LC, RVT, Op, + CallOptions, dl, Chain); + if (!IsStrict) + return Tmp.first; + + ReplaceValueWith(SDValue(N, 1), Tmp.second); + ReplaceValueWith(SDValue(N, 0), Tmp.first); + return SDValue(); } SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 56c13bb0753d2..dd082646ae5ab 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -592,8 +592,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) { SDLoc dl(N); SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(), - N->getMask(), ExtPassThru, N->getMemoryVT(), - N->getMemOperand(), ISD::EXTLOAD); + N->getOffset(), N->getMask(), ExtPassThru, + N->getMemoryVT(), N->getMemOperand(), + N->getAddressingMode(), ISD::EXTLOAD); // Legalize the chain result - switch anything that used the old chain to // use the new one. ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -1485,11 +1486,11 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, SDLoc dl(N); bool TruncateStore = false; - if (OpNo == 3) { + if (OpNo == 4) { Mask = PromoteTargetBoolean(Mask, DataVT); // Update in place. 
SmallVector NewOps(N->op_begin(), N->op_end()); - NewOps[3] = Mask; + NewOps[4] = Mask; return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); } else { // Data operand assert(OpNo == 1 && "Unexpected operand for promotion"); @@ -1497,14 +1498,15 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N, TruncateStore = true; } - return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask, - N->getMemoryVT(), N->getMemOperand(), + return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), + N->getOffset(), Mask, N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), TruncateStore, N->isCompressingStore()); } SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo) { - assert(OpNo == 2 && "Only know how to promote the mask!"); + assert(OpNo == 3 && "Only know how to promote the mask!"); EVT DataVT = N->getValueType(0); SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT); SmallVector NewOps(N->op_begin(), N->op_end()); @@ -1696,7 +1698,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::CTTZ_ZERO_UNDEF: case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break; case ISD::FLT_ROUNDS_: ExpandIntRes_FLT_ROUNDS(N, Lo, Hi); break; + case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break; + case ISD::STRICT_FP_TO_UINT: case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break; case ISD::STRICT_LLROUND: case ISD::STRICT_LLRINT: @@ -2562,7 +2566,9 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, SDLoc dl(N); EVT VT = N->getValueType(0); - SDValue Op = N->getOperand(0); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); + SDValue Op = N->getOperand(IsStrict ? 1 : 0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) Op = GetPromotedFloat(Op); @@ -2570,8 +2576,12 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo, assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!"); TargetLowering::MakeLibCallOptions CallOptions; CallOptions.setSExt(true); - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl).first, - Lo, Hi); + std::pair Tmp = TLI.makeLibCall(DAG, LC, VT, Op, + CallOptions, dl, Chain); + SplitInteger(Tmp.first, Lo, Hi); + + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Tmp.second); } void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, @@ -2579,15 +2589,21 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo, SDLoc dl(N); EVT VT = N->getValueType(0); - SDValue Op = N->getOperand(0); + bool IsStrict = N->isStrictFPOpcode(); + SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); + SDValue Op = N->getOperand(IsStrict ? 
1 : 0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) Op = GetPromotedFloat(Op); RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT); assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!"); TargetLowering::MakeLibCallOptions CallOptions; - SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl).first, - Lo, Hi); + std::pair Tmp = TLI.makeLibCall(DAG, LC, VT, Op, + CallOptions, dl, Chain); + SplitInteger(Tmp.first, Lo, Hi); + + if (IsStrict) + ReplaceValueWith(SDValue(N, 1), Tmp.second); } void DAGTypeLegalizer::ExpandIntRes_LLROUND_LLRINT(SDNode *N, SDValue &Lo, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index 9ddcbc9065251..7a97d980f9e4f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -974,32 +974,6 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) { return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi); } -/// Convert the node into a libcall with the same prototype. -SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N, - bool isSigned) { - TargetLowering::MakeLibCallOptions CallOptions; - CallOptions.setSExt(isSigned); - unsigned NumOps = N->getNumOperands(); - SDLoc dl(N); - if (NumOps == 0) { - return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, CallOptions, - dl).first; - } else if (NumOps == 1) { - SDValue Op = N->getOperand(0); - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, CallOptions, - dl).first; - } else if (NumOps == 2) { - SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) }; - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, CallOptions, - dl).first; - } - SmallVector Ops(NumOps); - for (unsigned i = 0; i < NumOps; ++i) - Ops[i] = N->getOperand(i); - - return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, CallOptions, dl).first; -} - /// Promote the given target boolean to a target boolean of the given type. /// A target boolean is an integer value, not necessarily of type i1, the bits /// of which conform to getBooleanContents. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index c944bda3700bf..42597fcd12ecb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -215,7 +215,6 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo); SDValue JoinIntegers(SDValue Lo, SDValue Hi); - SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned); std::pair ExpandAtomic(SDNode *Node); @@ -483,6 +482,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { // Convert Float Results to Integer. void SoftenFloatResult(SDNode *N, unsigned ResNo); + SDValue SoftenFloatRes_Unary(SDNode *N, RTLIB::Libcall LC); + SDValue SoftenFloatRes_Binary(SDNode *N, RTLIB::Libcall LC); SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo); SDValue SoftenFloatRes_BITCAST(SDNode *N); SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N); @@ -528,9 +529,9 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { // Convert Float Operand to Integer. 
bool SoftenFloatOperand(SDNode *N, unsigned OpNo); + SDValue SoftenFloatOp_Unary(SDNode *N, RTLIB::Libcall LC); SDValue SoftenFloatOp_BITCAST(SDNode *N); SDValue SoftenFloatOp_BR_CC(SDNode *N); - SDValue SoftenFloatOp_FP_EXTEND(SDNode *N); SDValue SoftenFloatOp_FP_ROUND(SDNode *N); SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N); SDValue SoftenFloatOp_LROUND(SDNode *N); @@ -540,6 +541,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatOp_SELECT_CC(SDNode *N); SDValue SoftenFloatOp_SETCC(SDNode *N); SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo); + SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N); //===--------------------------------------------------------------------===// // Float Expansion Support: LegalizeFloatTypes.cpp @@ -557,6 +559,10 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { // Float Result Expansion. void ExpandFloatResult(SDNode *N, unsigned ResNo); void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_Unary(SDNode *N, RTLIB::Libcall LC, + SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC, + SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 7bca3ea888ec4..9403b344ea747 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1541,12 +1541,15 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi) { + assert(MLD->isUnindexed() && "Indexed masked load during type legalization!"); EVT LoVT, HiVT; SDLoc dl(MLD); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Ch = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); + SDValue Offset = MLD->getOffset(); + assert(Offset.isUndef() && "Unexpected indexed masked load offset"); SDValue Mask = MLD->getMask(); SDValue PassThru = MLD->getPassThru(); unsigned Alignment = MLD->getOriginalAlignment(); @@ -1578,8 +1581,9 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, PassThruLo, LoMemVT, MMO, - ExtType, MLD->isExpandingLoad()); + Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT, + MMO, MLD->getAddressingMode(), ExtType, + MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG, MLD->isExpandingLoad()); @@ -1590,8 +1594,9 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, HiMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); - Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, PassThruHi, HiMemVT, MMO, - ExtType, MLD->isExpandingLoad()); + Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi, HiMemVT, + MMO, MLD->getAddressingMode(), ExtType, + MLD->isExpandingLoad()); // Build a factor node to remember that this load is independent of the // other one. 
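The same mechanical migration applies at every getMaskedLoad/getMaskedStore call site this patch touches: unindexed users now pass an undef offset and ISD::UNINDEXED. A minimal sketch of an updated unindexed call site follows; the variable names are illustrative, not taken from the patch.
// Sketch only: creating an unindexed masked load with the widened API.
// Assumes DAG, dl, Chain, Ptr, Mask, PassThru, VT and MMO are in scope,
// as in any SelectionDAG lowering routine.
SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); // unused for UNINDEXED
SDValue Load =
    DAG.getMaskedLoad(VT, dl, Chain, Ptr, Offset, Mask, PassThru, VT, MMO,
                      ISD::UNINDEXED, ISD::NON_EXTLOAD, /*IsExpanding=*/false);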
@@ -2326,8 +2331,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo) { + assert(N->isUnindexed() && "Indexed masked store of vector?"); SDValue Ch = N->getChain(); SDValue Ptr = N->getBasePtr(); + SDValue Offset = N->getOffset(); + assert(Offset.isUndef() && "Unexpected indexed masked store offset"); SDValue Mask = N->getMask(); SDValue Data = N->getValue(); EVT MemoryVT = N->getMemoryVT(); @@ -2361,8 +2369,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, - N->isTruncatingStore(), + Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, @@ -2374,8 +2382,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N, HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges()); - Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, - N->isTruncatingStore(), N->isCompressingStore()); + Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO, + N->getAddressingMode(), N->isTruncatingStore(), + N->isCompressingStore()); // Build a factor node to remember that this store is independent of the // other one. @@ -3699,10 +3708,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { WidenVT.getVectorNumElements()); Mask = ModifyToType(Mask, WideMaskVT, true); - SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(), - Mask, PassThru, N->getMemoryVT(), - N->getMemOperand(), ExtType, - N->isExpandingLoad()); + SDValue Res = DAG.getMaskedLoad( + WidenVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + ExtType, N->isExpandingLoad()); // Legalize the chain result - switch anything that used the old chain to // use the new one. 
ReplaceValueWith(SDValue(N, 1), Res.getValue(1)); @@ -4447,7 +4456,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) { StVal.getValueType().getVectorNumElements() && "Mask and data vectors should have the same number of elements"); return DAG.getMaskedStore(MST->getChain(), dl, StVal, MST->getBasePtr(), - Mask, MST->getMemoryVT(), MST->getMemOperand(), + MST->getOffset(), Mask, MST->getMemoryVT(), + MST->getMemOperand(), MST->getAddressingMode(), false, MST->isCompressingStore()); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index f1b88d80f43be..c1c599c5a5d83 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3350,20 +3350,20 @@ SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0, KnownBits N0Known = computeKnownBits(N0); bool overflow; - (void)(~N0Known.Zero).uadd_ov(~N1Known.Zero, overflow); + (void)N0Known.getMaxValue().uadd_ov(N1Known.getMaxValue(), overflow); if (!overflow) return OFK_Never; } // mulhi + 1 never overflow if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 && - (~N1Known.Zero & 0x01) == ~N1Known.Zero) + (N1Known.getMaxValue() & 0x01) == N1Known.getMaxValue()) return OFK_Never; if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) { KnownBits N0Known = computeKnownBits(N0); - if ((~N0Known.Zero & 0x01) == ~N0Known.Zero) + if ((N0Known.getMaxValue() & 0x01) == N0Known.getMaxValue()) return OFK_Never; } @@ -6975,16 +6975,22 @@ SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl, } SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, - SDValue Ptr, SDValue Mask, SDValue PassThru, - EVT MemVT, MachineMemOperand *MMO, + SDValue Base, SDValue Offset, SDValue Mask, + SDValue PassThru, EVT MemVT, + MachineMemOperand *MMO, + ISD::MemIndexedMode AM, ISD::LoadExtType ExtTy, bool isExpanding) { - SDVTList VTs = getVTList(VT, MVT::Other); - SDValue Ops[] = { Chain, Ptr, Mask, PassThru }; + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && + "Unindexed masked load with an offset!"); + SDVTList VTs = Indexed ? 
getVTList(VT, Base.getValueType(), MVT::Other) + : getVTList(VT, MVT::Other); + SDValue Ops[] = {Chain, Base, Offset, Mask, PassThru}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData<MaskedLoadSDNode>( - dl.getIROrder(), VTs, ExtTy, isExpanding, MemVT, MMO)); + dl.getIROrder(), VTs, AM, ExtTy, isExpanding, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { @@ -6992,7 +6998,7 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, return SDValue(E, 0); } auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, - ExtTy, isExpanding, MemVT, MMO); + AM, ExtTy, isExpanding, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); @@ -7002,27 +7008,45 @@ SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, return V; } +SDValue SelectionDAG::getIndexedMaskedLoad(SDValue OrigLoad, const SDLoc &dl, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM) { + MaskedLoadSDNode *LD = cast<MaskedLoadSDNode>(OrigLoad); + assert(LD->getOffset().isUndef() && "Masked load is already an indexed load!"); + return getMaskedLoad(OrigLoad.getValueType(), dl, LD->getChain(), Base, + Offset, LD->getMask(), LD->getPassThru(), + LD->getMemoryVT(), LD->getMemOperand(), AM, + LD->getExtensionType(), LD->isExpandingLoad()); +} + SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl, - SDValue Val, SDValue Ptr, SDValue Mask, - EVT MemVT, MachineMemOperand *MMO, - bool IsTruncating, bool IsCompressing) { + SDValue Val, SDValue Base, SDValue Offset, + SDValue Mask, EVT MemVT, + MachineMemOperand *MMO, + ISD::MemIndexedMode AM, bool IsTruncating, + bool IsCompressing) { assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); - SDVTList VTs = getVTList(MVT::Other); - SDValue Ops[] = { Chain, Val, Ptr, Mask }; + bool Indexed = AM != ISD::UNINDEXED; + assert((Indexed || Offset.isUndef()) && + "Unindexed masked store with an offset!"); + SDVTList VTs = Indexed ?
getVTList(Base.getValueType(), MVT::Other) + : getVTList(MVT::Other); + SDValue Ops[] = {Chain, Val, Base, Offset, Mask}; FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops); ID.AddInteger(MemVT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData<MaskedStoreSDNode>( - dl.getIROrder(), VTs, IsTruncating, IsCompressing, MemVT, MMO)); + dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast<MaskedStoreSDNode>(E)->refineAlignment(MMO); return SDValue(E, 0); } - auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, - IsTruncating, IsCompressing, MemVT, MMO); + auto *N = + newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM, + IsTruncating, IsCompressing, MemVT, MMO); createOperands(N, Ops); CSEMap.InsertNode(N, IP); @@ -7032,6 +7056,17 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl, return V; } +SDValue SelectionDAG::getIndexedMaskedStore(SDValue OrigStore, const SDLoc &dl, + SDValue Base, SDValue Offset, + ISD::MemIndexedMode AM) { + MaskedStoreSDNode *ST = cast<MaskedStoreSDNode>(OrigStore); + assert(ST->getOffset().isUndef() && + "Masked store is already an indexed store!"); + return getMaskedStore(ST->getChain(), dl, ST->getValue(), Base, Offset, + ST->getMask(), ST->getMemoryVT(), ST->getMemOperand(), + AM, ST->isTruncatingStore(), ST->isCompressingStore()); +} + SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef<SDValue> Ops, MachineMemOperand *MMO, @@ -7287,8 +7322,40 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, if (VTList.NumVTs == 1) return getNode(Opcode, DL, VTList.VTs[0], Ops); -#if 0 switch (Opcode) { + case ISD::STRICT_FP_EXTEND: + assert(VTList.NumVTs == 2 && Ops.size() == 2 && + "Invalid STRICT_FP_EXTEND!"); + assert(VTList.VTs[0].isFloatingPoint() && + Ops[1].getValueType().isFloatingPoint() && "Invalid FP cast!"); + assert(VTList.VTs[0].isVector() == Ops[1].getValueType().isVector() && + "STRICT_FP_EXTEND result type should be vector iff the operand " + "type is vector!"); + assert((!VTList.VTs[0].isVector() || + VTList.VTs[0].getVectorNumElements() == + Ops[1].getValueType().getVectorNumElements()) && + "Vector element count mismatch!"); + assert(Ops[1].getValueType().bitsLT(VTList.VTs[0]) && + "Invalid fpext node, dst <= src!"); + break; + case ISD::STRICT_FP_ROUND: + assert(VTList.NumVTs == 2 && Ops.size() == 3 && "Invalid STRICT_FP_ROUND!"); + assert(VTList.VTs[0].isVector() == Ops[1].getValueType().isVector() && + "STRICT_FP_ROUND result type should be vector iff the operand " + "type is vector!"); + assert((!VTList.VTs[0].isVector() || + VTList.VTs[0].getVectorNumElements() == + Ops[1].getValueType().getVectorNumElements()) && + "Vector element count mismatch!"); + assert(VTList.VTs[0].isFloatingPoint() && + Ops[1].getValueType().isFloatingPoint() && + VTList.VTs[0].bitsLT(Ops[1].getValueType()) && + isa<ConstantSDNode>(Ops[2]) && + (cast<ConstantSDNode>(Ops[2])->getZExtValue() == 0 || + cast<ConstantSDNode>(Ops[2])->getZExtValue() == 1) && + "Invalid STRICT_FP_ROUND!"); + break; +#if 0 // FIXME: figure out how to safely handle things like // int foo(int x) { return 1 << (x & 255); } // int bar() { return foo(256); } @@ -7307,8 +7374,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList, return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0)); } break; - } #endif + } // Memoize the node unless it returns a flag.
SDNode *N; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 1ed0dc2c979fc..0aeb3c14aa370 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4295,6 +4295,7 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); SDValue Mask = getValue(MaskOperand); + SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); EVT VT = Src0.getValueType(); if (!Alignment) @@ -4311,9 +4312,9 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I, // vectors. VT.getStoreSize().getKnownMinSize(), Alignment, AAInfo); - SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT, - MMO, false /* Truncating */, - IsCompressing); + SDValue StoreNode = + DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO, + ISD::UNINDEXED, false /* Truncating */, IsCompressing); DAG.setRoot(StoreNode); setValue(&I, StoreNode); } @@ -4461,6 +4462,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { SDValue Ptr = getValue(PtrOperand); SDValue Src0 = getValue(Src0Operand); SDValue Mask = getValue(MaskOperand); + SDValue Offset = DAG.getUNDEF(Ptr.getValueType()); EVT VT = Src0.getValueType(); if (!Alignment) @@ -4491,8 +4493,9 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { VT.getStoreSize().getKnownMinSize(), Alignment, AAInfo, Ranges); - SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO, - ISD::NON_EXTLOAD, IsExpanding); + SDValue Load = + DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO, + ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding); if (AddToChain) PendingLoads.push_back(Load.getValue(1)); setValue(&I, Load); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index bc10f76212394..f863d9876486b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -685,6 +685,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (doExt) OS << " from " << MLd->getMemoryVT().getEVTString(); + const char *AM = getIndexedModeName(MLd->getAddressingMode()); + if (*AM) + OS << ", " << AM; + if (MLd->isExpandingLoad()) OS << ", expanding"; @@ -696,6 +700,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { if (MSt->isTruncatingStore()) OS << ", trunc to " << MSt->getMemoryVT().getEVTString(); + const char *AM = getIndexedModeName(MSt->getAddressingMode()); + if (*AM) + OS << ", " << AM; + if (MSt->isCompressingStore()) OS << ", compressing"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 79dbd44bb4772..a03f7923d71e1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -148,17 +148,17 @@ static cl::opt ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden, cl::desc("Pop up a window to show dags before legalize types")); static cl::opt -ViewLegalizeDAGs("view-legalize-dags", cl::Hidden, - cl::desc("Pop up a window to show dags before legalize")); + ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden, + cl::desc("Pop up a window to show dags before the post " + "legalize types dag combine pass")); +static cl::opt + 
ViewLegalizeDAGs("view-legalize-dags", cl::Hidden, + cl::desc("Pop up a window to show dags before legalize")); static cl::opt ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden, cl::desc("Pop up a window to show dags before the second " "dag combine pass")); static cl::opt -ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden, - cl::desc("Pop up a window to show dags before the post legalize types" - " dag combine pass")); -static cl::opt ViewISelDAGs("view-isel-dags", cl::Hidden, cl::desc("Pop up a window to show isel dags as they are selected")); static cl::opt @@ -168,12 +168,10 @@ static cl::opt ViewSUnitDAGs("view-sunit-dags", cl::Hidden, cl::desc("Pop up a window to show SUnit dags after they are processed")); #else -static const bool ViewDAGCombine1 = false, - ViewLegalizeTypesDAGs = false, ViewLegalizeDAGs = false, - ViewDAGCombine2 = false, - ViewDAGCombineLT = false, - ViewISelDAGs = false, ViewSchedDAGs = false, - ViewSUnitDAGs = false; +static const bool ViewDAGCombine1 = false, ViewLegalizeTypesDAGs = false, + ViewDAGCombineLT = false, ViewLegalizeDAGs = false, + ViewDAGCombine2 = false, ViewISelDAGs = false, + ViewSchedDAGs = false, ViewSUnitDAGs = false; #endif //===---------------------------------------------------------------------===// @@ -790,8 +788,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { FuncInfo->MBB->getBasicBlock()->getName()); #endif #ifdef NDEBUG - if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs || - ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs || + if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewDAGCombineLT || + ViewLegalizeDAGs || ViewDAGCombine2 || ViewISelDAGs || ViewSchedDAGs || ViewSUnitDAGs) #endif { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index c24a3670c9867..1e51ec2d6a219 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6225,6 +6225,26 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node, } } + // If none of the above worked, but there are no NaNs, then expand to + // a compare/select sequence. This is required for correctness since + // InstCombine might have canonicalized a fcmp+select sequence to a + // FMINNUM/FMAXNUM node. If we were to fall through to the default + // expansion to libcall, we might introduce a link-time dependency + // on libm into a file that originally did not have one. + if (Node->getFlags().hasNoNaNs()) { + ISD::CondCode Pred = + Node->getOpcode() == ISD::FMINNUM ? ISD::SETLT : ISD::SETGT; + SDValue Op1 = Node->getOperand(0); + SDValue Op2 = Node->getOperand(1); + SDValue SelCC = DAG.getSelectCC(dl, Op1, Op2, Op1, Op2, Pred); + // Copy FMF flags, but always set the no-signed-zeros flag + // as this is implied by the FMINNUM/FMAXNUM semantics. 
+ SDNodeFlags Flags = Node->getFlags(); + Flags.setNoSignedZeros(true); + SelCC->setFlags(Flags); + return SelCC; + } + return SDValue(); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index af7dc432eae5b..cc436fcc4f684 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -633,6 +633,8 @@ void TargetLoweringBase::initActions() { IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) { setIndexedLoadAction(IM, VT, Expand); setIndexedStoreAction(IM, VT, Expand); + setIndexedMaskedLoadAction(IM, VT, Expand); + setIndexedMaskedStoreAction(IM, VT, Expand); } // Most backends expect to see the node which just returns the value loaded. diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 7b547d41fb60b..41cb511ad9b47 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -179,10 +179,10 @@ static cl::opt UseCFLAA( /// Option names for limiting the codegen pipeline. /// Those are used in error reporting and we didn't want /// to duplicate their names all over the place. -static const char *StartAfterOptName = "start-after"; -static const char *StartBeforeOptName = "start-before"; -static const char *StopAfterOptName = "stop-after"; -static const char *StopBeforeOptName = "stop-before"; +static const char StartAfterOptName[] = "start-after"; +static const char StartBeforeOptName[] = "start-before"; +static const char StopAfterOptName[] = "stop-after"; +static const char StopBeforeOptName[] = "stop-before"; static cl::opt<std::string> StartAfterOpt(StringRef(StartAfterOptName), diff --git a/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp b/llvm/lib/CodeGen/TypePromotion.cpp similarity index 78% rename from llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp rename to llvm/lib/CodeGen/TypePromotion.cpp index 1c2c8aef55bb8..94fe7d2c70304 100644 --- a/llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/TypePromotion.cpp @@ -1,4 +1,4 @@ -//===----- ARMCodeGenPrepare.cpp ------------------------------------------===// +//===----- TypePromotion.cpp ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,23 +7,25 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass inserts intrinsics to handle small types that would otherwise be -/// promoted during legalization. Here we can manually promote types or insert -/// intrinsics which can handle narrow types that aren't supported by the -/// register classes. -// +/// This is an opcode-based type promotion pass for small types that would +/// otherwise be promoted during legalization. This works around the limitations +/// of SelectionDAG for cyclic regions. The search begins from icmp +/// instruction operands, where a tree, consisting of non-wrapping or safe +/// wrapping instructions, is built, checked and promoted if possible.
+/// //===----------------------------------------------------------------------===// -#include "ARM.h" -#include "ARMSubtarget.h" -#include "ARMTargetMachine.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -32,26 +34,19 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" -#define DEBUG_TYPE "arm-codegenprepare" +#define DEBUG_TYPE "type-promotion" +#define PASS_NAME "Type Promotion" using namespace llvm; static cl::opt -DisableCGP("arm-disable-cgp", cl::Hidden, cl::init(true), - cl::desc("Disable ARM specific CodeGenPrepare pass")); - -static cl::opt -EnableDSP("arm-enable-scalar-dsp", cl::Hidden, cl::init(false), - cl::desc("Use DSP instructions for scalar operations")); - -static cl::opt -EnableDSPWithImms("arm-enable-scalar-dsp-imms", cl::Hidden, cl::init(false), - cl::desc("Use DSP instructions for scalar operations\ - with immediate operands")); +DisablePromotion("disable-type-promotion", cl::Hidden, cl::init(true), + cl::desc("Disable type promotion pass")); // The goal of this pass is to enable more efficient code generation for // operations on narrow types (i.e. types with < 32-bits) and this is a @@ -111,7 +106,6 @@ class IRPromoter { SmallPtrSet InstsToRemove; DenseMap> TruncTysMap; SmallPtrSet Promoted; - Module *M = nullptr; LLVMContext &Ctx; // The type we promote to: always i32 IntegerType *ExtTy = nullptr; @@ -134,11 +128,10 @@ class IRPromoter { void Cleanup(void); public: - IRPromoter(Module *M) : M(M), Ctx(M->getContext()), - ExtTy(Type::getInt32Ty(Ctx)) { } + IRPromoter(Module *M) : Ctx(M->getContext()) { } - void Mutate(Type *OrigTy, + void Mutate(Type *OrigTy, unsigned PromotedWidth, SetVector &Visited, SmallPtrSetImpl &Sources, SmallPtrSetImpl &Sinks, @@ -146,30 +139,29 @@ class IRPromoter { SmallPtrSetImpl &SafeWrap); }; -class ARMCodeGenPrepare : public FunctionPass { - const ARMSubtarget *ST = nullptr; +class TypePromotion : public FunctionPass { IRPromoter *Promoter = nullptr; - std::set AllVisited; + SmallPtrSet AllVisited; SmallPtrSet SafeToPromote; SmallPtrSet SafeWrap; bool isSafeWrap(Instruction *I); bool isSupportedValue(Value *V); bool isLegalToPromote(Value *V); - bool TryToPromote(Value *V); + bool TryToPromote(Value *V, unsigned PromotedWidth); public: static char ID; static unsigned TypeSize; Type *OrigTy = nullptr; - ARMCodeGenPrepare() : FunctionPass(ID) {} + TypePromotion() : FunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); } - StringRef getPassName() const override { return "ARM IR optimizations"; } + StringRef getPassName() const override { return PASS_NAME; } bool doInitialization(Module &M) override; bool runOnFunction(Function &F) override; @@ -188,19 +180,19 @@ static bool GenerateSignBits(Value *V) { } static bool EqualTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() == ARMCodeGenPrepare::TypeSize; + return V->getType()->getScalarSizeInBits() == TypePromotion::TypeSize; } static 
bool LessOrEqualTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() <= ARMCodeGenPrepare::TypeSize; + return V->getType()->getScalarSizeInBits() <= TypePromotion::TypeSize; } static bool GreaterThanTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() > ARMCodeGenPrepare::TypeSize; + return V->getType()->getScalarSizeInBits() > TypePromotion::TypeSize; } static bool LessThanTypeSize(Value *V) { - return V->getType()->getScalarSizeInBits() < ARMCodeGenPrepare::TypeSize; + return V->getType()->getScalarSizeInBits() < TypePromotion::TypeSize; } /// Some instructions can use 8- and 16-bit operands, and we don't need to @@ -278,7 +270,7 @@ static bool isSink(Value *V) { } /// Return whether this instruction can safely wrap. -bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) { +bool TypePromotion::isSafeWrap(Instruction *I) { // We can support a, potentially, wrapping instruction (I) if: // - It is only used by an unsigned icmp. // - The icmp uses a constant. @@ -374,7 +366,7 @@ bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) { Total += OverflowConst->getValue().getBitWidth() < 32 ? OverflowConst->getValue().abs().zext(32) : OverflowConst->getValue().abs(); - APInt Max = APInt::getAllOnesValue(ARMCodeGenPrepare::TypeSize); + APInt Max = APInt::getAllOnesValue(TypePromotion::TypeSize); if (Total.getBitWidth() > Max.getBitWidth()) { if (Total.ugt(Max.zext(Total.getBitWidth()))) @@ -385,7 +377,7 @@ bool ARMCodeGenPrepare::isSafeWrap(Instruction *I) { } else if (Total.ugt(Max)) return false; - LLVM_DEBUG(dbgs() << "ARM CGP: Allowing safe overflow for " << *I << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Allowing safe overflow for " << *I << "\n"); SafeWrap.insert(I); return true; } @@ -422,32 +414,12 @@ static bool isPromotedResultSafe(Value *V) { return cast(V)->hasNoUnsignedWrap(); } -/// Return the intrinsic for the instruction that can perform the same -/// operation but on a narrow type. This is using the parallel dsp intrinsics -/// on scalar values. -static Intrinsic::ID getNarrowIntrinsic(Instruction *I) { - // Whether we use the signed or unsigned versions of these intrinsics - // doesn't matter because we're not using the GE bits that they set in - // the APSR. - switch(I->getOpcode()) { - default: - break; - case Instruction::Add: - return ARMCodeGenPrepare::TypeSize == 16 ? Intrinsic::arm_uadd16 : - Intrinsic::arm_uadd8; - case Instruction::Sub: - return ARMCodeGenPrepare::TypeSize == 16 ? 
Intrinsic::arm_usub16 : - Intrinsic::arm_usub8; - } - llvm_unreachable("unhandled opcode for narrow intrinsic"); -} - void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { SmallVector Users; Instruction *InstTo = dyn_cast(To); bool ReplacedAll = true; - LLVM_DEBUG(dbgs() << "ARM CGP: Replacing " << *From << " with " << *To + LLVM_DEBUG(dbgs() << "IR Promotion: Replacing " << *From << " with " << *To << "\n"); for (Use &U : From->uses()) { @@ -468,7 +440,7 @@ void IRPromoter::ReplaceAllUsersOfWith(Value *From, Value *To) { } void IRPromoter::PrepareWrappingAdds() { - LLVM_DEBUG(dbgs() << "ARM CGP: Prepare underflowing adds.\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Prepare wrapping adds.\n"); IRBuilder<> Builder{Ctx}; // For adds that safely wrap and use a negative immediate as operand 1, we @@ -479,7 +451,7 @@ void IRPromoter::PrepareWrappingAdds() { if (I->getOpcode() != Instruction::Add) continue; - LLVM_DEBUG(dbgs() << "ARM CGP: Adjusting " << *I << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Adjusting " << *I << "\n"); assert((isa(I->getOperand(1)) && cast(I->getOperand(1))->isNegative()) && "Wrapping should have a negative immediate as the second operand"); @@ -494,7 +466,7 @@ void IRPromoter::PrepareWrappingAdds() { } InstsToRemove.insert(I); I->replaceAllUsesWith(NewVal); - LLVM_DEBUG(dbgs() << "ARM CGP: New equivalent: " << *NewVal << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: New equivalent: " << *NewVal << "\n"); } for (auto *I : NewInsts) Visited->insert(I); @@ -505,7 +477,7 @@ void IRPromoter::ExtendSources() { auto InsertZExt = [&](Value *V, Instruction *InsertPt) { assert(V->getType() != ExtTy && "zext already extends to i32"); - LLVM_DEBUG(dbgs() << "ARM CGP: Inserting ZExt for " << *V << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Inserting ZExt for " << *V << "\n"); Builder.SetInsertPoint(InsertPt); if (auto *I = dyn_cast(V)) Builder.SetCurrentDebugLocation(I->getDebugLoc()); @@ -523,7 +495,7 @@ void IRPromoter::ExtendSources() { }; // Now, insert extending instructions between the sources and their users. - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting sources:\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Promoting sources:\n"); for (auto V : *Sources) { LLVM_DEBUG(dbgs() << " - " << *V << "\n"); if (auto *I = dyn_cast(V)) @@ -539,7 +511,7 @@ void IRPromoter::ExtendSources() { } void IRPromoter::PromoteTree() { - LLVM_DEBUG(dbgs() << "ARM CGP: Mutating the tree..\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Mutating the tree..\n"); IRBuilder<> Builder{Ctx}; @@ -570,38 +542,10 @@ void IRPromoter::PromoteTree() { Promoted.insert(I); } } - - // Finally, any instructions that should be promoted but haven't yet been, - // need to be handled using intrinsics. - for (auto *V : *Visited) { - auto *I = dyn_cast(V); - if (!I) - continue; - - if (Sources->count(I) || Sinks->count(I)) - continue; - - if (!shouldPromote(I) || SafeToPromote->count(I) || NewInsts.count(I)) - continue; - - assert(EnableDSP && "DSP intrinisc insertion not enabled!"); - - // Replace unsafe instructions with appropriate intrinsic calls. 
- LLVM_DEBUG(dbgs() << "ARM CGP: Inserting DSP intrinsic for " - << *I << "\n"); - Function *DSPInst = - Intrinsic::getDeclaration(M, getNarrowIntrinsic(I)); - Builder.SetInsertPoint(I); - Builder.SetCurrentDebugLocation(I->getDebugLoc()); - Value *Args[] = { I->getOperand(0), I->getOperand(1) }; - CallInst *Call = Builder.CreateCall(DSPInst, Args); - NewInsts.insert(Call); - ReplaceAllUsersOfWith(I, Call); - } } void IRPromoter::TruncateSinks() { - LLVM_DEBUG(dbgs() << "ARM CGP: Fixing up the sinks:\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Fixing up the sinks:\n"); IRBuilder<> Builder{Ctx}; @@ -612,7 +556,7 @@ void IRPromoter::TruncateSinks() { if ((!Promoted.count(V) && !NewInsts.count(V)) || Sources->count(V)) return nullptr; - LLVM_DEBUG(dbgs() << "ARM CGP: Creating " << *TruncTy << " Trunc for " + LLVM_DEBUG(dbgs() << "IR Promotion: Creating " << *TruncTy << " Trunc for " << *V << "\n"); Builder.SetInsertPoint(cast(V)); auto *Trunc = dyn_cast(Builder.CreateTrunc(V, TruncTy)); @@ -624,7 +568,7 @@ void IRPromoter::TruncateSinks() { // Fix up any stores or returns that use the results of the promoted // chain. for (auto I : *Sinks) { - LLVM_DEBUG(dbgs() << "ARM CGP: For Sink: " << *I << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: For Sink: " << *I << "\n"); // Handle calls separately as we need to iterate over arg operands. if (auto *Call = dyn_cast(I)) { @@ -661,7 +605,7 @@ void IRPromoter::TruncateSinks() { } void IRPromoter::Cleanup() { - LLVM_DEBUG(dbgs() << "ARM CGP: Cleanup..\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Cleanup..\n"); // Some zexts will now have become redundant, along with their trunc // operands, so remove them for (auto V : *Visited) { @@ -674,7 +618,7 @@ void IRPromoter::Cleanup() { Value *Src = ZExt->getOperand(0); if (ZExt->getSrcTy() == ZExt->getDestTy()) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing unnecessary cast: " << *ZExt + LLVM_DEBUG(dbgs() << "IR Promotion: Removing unnecessary cast: " << *ZExt << "\n"); ReplaceAllUsersOfWith(ZExt, Src); continue; @@ -693,7 +637,7 @@ void IRPromoter::Cleanup() { } for (auto *I : InstsToRemove) { - LLVM_DEBUG(dbgs() << "ARM CGP: Removing " << *I << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Removing " << *I << "\n"); I->dropAllReferences(); I->eraseFromParent(); } @@ -707,7 +651,7 @@ void IRPromoter::Cleanup() { } void IRPromoter::ConvertTruncs() { - LLVM_DEBUG(dbgs() << "ARM CGP: Converting truncs..\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Converting truncs..\n"); IRBuilder<> Builder{Ctx}; for (auto *V : *Visited) { @@ -731,17 +675,18 @@ void IRPromoter::ConvertTruncs() { } } -void IRPromoter::Mutate(Type *OrigTy, +void IRPromoter::Mutate(Type *OrigTy, unsigned PromotedWidth, SetVector &Visited, SmallPtrSetImpl &Sources, SmallPtrSetImpl &Sinks, SmallPtrSetImpl &SafeToPromote, SmallPtrSetImpl &SafeWrap) { - LLVM_DEBUG(dbgs() << "ARM CGP: Promoting use-def chains to from " - << ARMCodeGenPrepare::TypeSize << " to 32-bits\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Promoting use-def chains to from " + << TypePromotion::TypeSize << " to 32-bits\n"); assert(isa(OrigTy) && "expected integer type"); this->OrigTy = cast(OrigTy); + ExtTy = IntegerType::get(Ctx, PromotedWidth); assert(OrigTy->getPrimitiveSizeInBits() < ExtTy->getPrimitiveSizeInBits() && "original type not smaller than extended type"); @@ -779,9 +724,7 @@ void IRPromoter::Mutate(Type *OrigTy, // Insert zext instructions between sources and their users. ExtendSources(); - // Promote visited instructions, mutating their types in place. 
Also insert - // DSP intrinsics, if enabled, for adds and subs which would be unsafe to - // promote. + // Promote visited instructions, mutating their types in place. PromoteTree(); // Convert any truncs, that aren't sources, into AND masks. @@ -794,14 +737,14 @@ void IRPromoter::Mutate(Type *OrigTy, // clear the data structures. Cleanup(); - LLVM_DEBUG(dbgs() << "ARM CGP: Mutation complete\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Mutation complete\n"); } /// We accept most instructions, as well as Arguments and ConstantInsts. We /// Disallow casts other than zext and truncs and only allow calls if their /// return value is zeroext. We don't allow opcodes that can introduce sign /// bits. -bool ARMCodeGenPrepare::isSupportedValue(Value *V) { +bool TypePromotion::isSupportedValue(Value *V) { if (auto *I = dyn_cast(V)) { switch (I->getOpcode()) { default: @@ -849,7 +792,7 @@ bool ARMCodeGenPrepare::isSupportedValue(Value *V) { /// Check that the type of V would be promoted and that the original type is /// smaller than the targeted promoted type. Check that we're not trying to /// promote something larger than our base 'TypeSize' type. -bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { +bool TypePromotion::isLegalToPromote(Value *V) { auto *I = dyn_cast(V); if (!I) @@ -862,47 +805,20 @@ bool ARMCodeGenPrepare::isLegalToPromote(Value *V) { SafeToPromote.insert(I); return true; } - - if (I->getOpcode() != Instruction::Add && I->getOpcode() != Instruction::Sub) - return false; - - // If promotion is not safe, can we use a DSP instruction to natively - // handle the narrow type? - if (!ST->hasDSP() || !EnableDSP || !isSupportedType(I)) - return false; - - if (ST->isThumb() && !ST->hasThumb2()) - return false; - - // TODO - // Would it be profitable? For Thumb code, these parallel DSP instructions - // are only Thumb-2, so we wouldn't be able to dual issue on Cortex-M33. For - // Cortex-A, specifically Cortex-A72, the latency is double and throughput is - // halved. They also do not take immediates as operands. 
- for (auto &Op : I->operands()) { - if (isa(Op)) { - if (!EnableDSPWithImms) - return false; - } - } - LLVM_DEBUG(dbgs() << "ARM CGP: Will use an intrinsic for: " << *I << "\n"); - return true; + return false; } -bool ARMCodeGenPrepare::TryToPromote(Value *V) { +bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) { OrigTy = V->getType(); TypeSize = OrigTy->getPrimitiveSizeInBits(); - if (TypeSize > 16 || TypeSize < 8) - return false; - SafeToPromote.clear(); SafeWrap.clear(); if (!isSupportedValue(V) || !shouldPromote(V) || !isLegalToPromote(V)) return false; - LLVM_DEBUG(dbgs() << "ARM CGP: TryToPromote: " << *V << ", TypeSize = " - << TypeSize << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: TryToPromote: " << *V << ", from " + << TypeSize << " bits to " << PromotedWidth << "\n"); SetVector WorkList; SmallPtrSet Sources; @@ -923,7 +839,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { return true; if (!isSupportedValue(V) || (shouldPromote(V) && !isLegalToPromote(V))) { - LLVM_DEBUG(dbgs() << "ARM CGP: Can't handle: " << *V << "\n"); + LLVM_DEBUG(dbgs() << "IR Promotion: Can't handle: " << *V << "\n"); return false; } @@ -979,7 +895,7 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { } } - LLVM_DEBUG(dbgs() << "ARM CGP: Visited nodes:\n"; + LLVM_DEBUG(dbgs() << "IR Promotion: Visited nodes:\n"; for (auto *I : CurrentVisited) I->dump(); ); @@ -995,28 +911,31 @@ bool ARMCodeGenPrepare::TryToPromote(Value *V) { if (ToPromote < 2) return false; - Promoter->Mutate(OrigTy, CurrentVisited, Sources, Sinks, SafeToPromote, - SafeWrap); + Promoter->Mutate(OrigTy, PromotedWidth, CurrentVisited, Sources, Sinks, + SafeToPromote, SafeWrap); return true; } -bool ARMCodeGenPrepare::doInitialization(Module &M) { +bool TypePromotion::doInitialization(Module &M) { Promoter = new IRPromoter(&M); return false; } -bool ARMCodeGenPrepare::runOnFunction(Function &F) { - if (skipFunction(F) || DisableCGP) +bool TypePromotion::runOnFunction(Function &F) { + if (skipFunction(F) || DisablePromotion) return false; - auto *TPC = &getAnalysis(); + LLVM_DEBUG(dbgs() << "IR Promotion: Running on " << F.getName() << "\n"); + + auto *TPC = getAnalysisIfAvailable(); if (!TPC) return false; - const TargetMachine &TM = TPC->getTM(); - ST = &TM.getSubtarget(F); bool MadeChange = false; - LLVM_DEBUG(dbgs() << "ARM CGP: Running on " << F.getName() << "\n"); + const DataLayout &DL = F.getParent()->getDataLayout(); + const TargetMachine &TM = TPC->getTM(); + const TargetSubtargetInfo *SubtargetInfo = TM.getSubtargetImpl(F); + const TargetLowering *TLI = SubtargetInfo->getTargetLowering(); // Search up from icmps to try to promote their operands. 
for (BasicBlock &BB : F) { @@ -1025,18 +944,30 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) { if (AllVisited.count(&I)) continue; - if (isa(I)) { - auto &CI = cast(I); + if (!isa(&I)) + continue; + + auto *ICmp = cast(&I); + // Skip signed or pointer compares + if (ICmp->isSigned() || + !isa(ICmp->getOperand(0)->getType())) + continue; + + LLVM_DEBUG(dbgs() << "IR Promotion: Searching from: " << *ICmp << "\n"); - // Skip signed or pointer compares - if (CI.isSigned() || !isa(CI.getOperand(0)->getType())) - continue; + for (auto &Op : ICmp->operands()) { + if (auto *I = dyn_cast(Op)) { + EVT SrcVT = TLI->getValueType(DL, I->getType()); + if (SrcVT.isSimple() && TLI->isTypeLegal(SrcVT.getSimpleVT())) + break; - LLVM_DEBUG(dbgs() << "ARM CGP: Searching from: " << CI << "\n"); + if (TLI->getTypeAction(ICmp->getContext(), SrcVT) != + TargetLowering::TypePromoteInteger) + break; - for (auto &Op : CI.operands()) { - if (auto *I = dyn_cast(Op)) - MadeChange |= TryToPromote(I); + EVT PromotedVT = TLI->getTypeToTransformTo(ICmp->getContext(), SrcVT); + MadeChange |= TryToPromote(I, PromotedVT.getSizeInBits()); + break; } } } @@ -1046,24 +977,22 @@ bool ARMCodeGenPrepare::runOnFunction(Function &F) { }); } if (MadeChange) - LLVM_DEBUG(dbgs() << "After ARMCodeGenPrepare: " << F << "\n"); + LLVM_DEBUG(dbgs() << "After TypePromotion: " << F << "\n"); return MadeChange; } -bool ARMCodeGenPrepare::doFinalization(Module &M) { +bool TypePromotion::doFinalization(Module &M) { delete Promoter; return false; } -INITIALIZE_PASS_BEGIN(ARMCodeGenPrepare, DEBUG_TYPE, - "ARM IR optimizations", false, false) -INITIALIZE_PASS_END(ARMCodeGenPrepare, DEBUG_TYPE, "ARM IR optimizations", - false, false) +INITIALIZE_PASS_BEGIN(TypePromotion, DEBUG_TYPE, PASS_NAME, false, false) +INITIALIZE_PASS_END(TypePromotion, DEBUG_TYPE, PASS_NAME, false, false) -char ARMCodeGenPrepare::ID = 0; -unsigned ARMCodeGenPrepare::TypeSize = 0; +char TypePromotion::ID = 0; +unsigned TypePromotion::TypeSize = 0; -FunctionPass *llvm::createARMCodeGenPreparePass() { - return new ARMCodeGenPrepare(); +FunctionPass *llvm::createTypePromotionPass() { + return new TypePromotion(); } diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index b868abf695823..41cbdf0355585 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -120,139 +120,14 @@ std::string EVT::getEVTString() const { + getVectorElementType().getEVTString(); if (isInteger()) return "i" + utostr(getSizeInBits()); + if (isFloatingPoint()) + return "f" + utostr(getSizeInBits()); llvm_unreachable("Invalid EVT!"); - case MVT::i1: return "i1"; - case MVT::i8: return "i8"; - case MVT::i16: return "i16"; - case MVT::i32: return "i32"; - case MVT::i64: return "i64"; - case MVT::i128: return "i128"; - case MVT::f16: return "f16"; - case MVT::f32: return "f32"; - case MVT::f64: return "f64"; - case MVT::f80: return "f80"; - case MVT::f128: return "f128"; case MVT::ppcf128: return "ppcf128"; case MVT::isVoid: return "isVoid"; case MVT::Other: return "ch"; case MVT::Glue: return "glue"; case MVT::x86mmx: return "x86mmx"; - case MVT::v1i1: return "v1i1"; - case MVT::v2i1: return "v2i1"; - case MVT::v4i1: return "v4i1"; - case MVT::v8i1: return "v8i1"; - case MVT::v16i1: return "v16i1"; - case MVT::v32i1: return "v32i1"; - case MVT::v64i1: return "v64i1"; - case MVT::v128i1: return "v128i1"; - case MVT::v256i1: return "v256i1"; - case MVT::v512i1: return "v512i1"; - case MVT::v1024i1: return "v1024i1"; - case MVT::v1i8: return "v1i8"; 
- case MVT::v2i8: return "v2i8"; - case MVT::v4i8: return "v4i8"; - case MVT::v8i8: return "v8i8"; - case MVT::v16i8: return "v16i8"; - case MVT::v32i8: return "v32i8"; - case MVT::v64i8: return "v64i8"; - case MVT::v128i8: return "v128i8"; - case MVT::v256i8: return "v256i8"; - case MVT::v1i16: return "v1i16"; - case MVT::v2i16: return "v2i16"; - case MVT::v3i16: return "v3i16"; - case MVT::v4i16: return "v4i16"; - case MVT::v8i16: return "v8i16"; - case MVT::v16i16: return "v16i16"; - case MVT::v32i16: return "v32i16"; - case MVT::v64i16: return "v64i16"; - case MVT::v128i16: return "v128i16"; - case MVT::v1i32: return "v1i32"; - case MVT::v2i32: return "v2i32"; - case MVT::v3i32: return "v3i32"; - case MVT::v4i32: return "v4i32"; - case MVT::v5i32: return "v5i32"; - case MVT::v8i32: return "v8i32"; - case MVT::v16i32: return "v16i32"; - case MVT::v32i32: return "v32i32"; - case MVT::v64i32: return "v64i32"; - case MVT::v128i32: return "v128i32"; - case MVT::v256i32: return "v256i32"; - case MVT::v512i32: return "v512i32"; - case MVT::v1024i32:return "v1024i32"; - case MVT::v2048i32:return "v2048i32"; - case MVT::v1i64: return "v1i64"; - case MVT::v2i64: return "v2i64"; - case MVT::v4i64: return "v4i64"; - case MVT::v8i64: return "v8i64"; - case MVT::v16i64: return "v16i64"; - case MVT::v32i64: return "v32i64"; - case MVT::v1i128: return "v1i128"; - case MVT::v1f32: return "v1f32"; - case MVT::v2f32: return "v2f32"; - case MVT::v2f16: return "v2f16"; - case MVT::v3f16: return "v3f16"; - case MVT::v4f16: return "v4f16"; - case MVT::v8f16: return "v8f16"; - case MVT::v16f16: return "v16f16"; - case MVT::v32f16: return "v32f16"; - case MVT::v3f32: return "v3f32"; - case MVT::v4f32: return "v4f32"; - case MVT::v5f32: return "v5f32"; - case MVT::v8f32: return "v8f32"; - case MVT::v16f32: return "v16f32"; - case MVT::v32f32: return "v32f32"; - case MVT::v64f32: return "v64f32"; - case MVT::v128f32: return "v128f32"; - case MVT::v256f32: return "v256f32"; - case MVT::v512f32: return "v512f32"; - case MVT::v1024f32:return "v1024f32"; - case MVT::v2048f32:return "v2048f32"; - case MVT::v1f64: return "v1f64"; - case MVT::v2f64: return "v2f64"; - case MVT::v4f64: return "v4f64"; - case MVT::v8f64: return "v8f64"; - case MVT::nxv1i1: return "nxv1i1"; - case MVT::nxv2i1: return "nxv2i1"; - case MVT::nxv4i1: return "nxv4i1"; - case MVT::nxv8i1: return "nxv8i1"; - case MVT::nxv16i1: return "nxv16i1"; - case MVT::nxv32i1: return "nxv32i1"; - case MVT::nxv1i8: return "nxv1i8"; - case MVT::nxv2i8: return "nxv2i8"; - case MVT::nxv4i8: return "nxv4i8"; - case MVT::nxv8i8: return "nxv8i8"; - case MVT::nxv16i8: return "nxv16i8"; - case MVT::nxv32i8: return "nxv32i8"; - case MVT::nxv1i16: return "nxv1i16"; - case MVT::nxv2i16: return "nxv2i16"; - case MVT::nxv4i16: return "nxv4i16"; - case MVT::nxv8i16: return "nxv8i16"; - case MVT::nxv16i16:return "nxv16i16"; - case MVT::nxv32i16:return "nxv32i16"; - case MVT::nxv1i32: return "nxv1i32"; - case MVT::nxv2i32: return "nxv2i32"; - case MVT::nxv4i32: return "nxv4i32"; - case MVT::nxv8i32: return "nxv8i32"; - case MVT::nxv16i32:return "nxv16i32"; - case MVT::nxv32i32:return "nxv32i32"; - case MVT::nxv1i64: return "nxv1i64"; - case MVT::nxv2i64: return "nxv2i64"; - case MVT::nxv4i64: return "nxv4i64"; - case MVT::nxv8i64: return "nxv8i64"; - case MVT::nxv16i64:return "nxv16i64"; - case MVT::nxv32i64:return "nxv32i64"; - case MVT::nxv2f16: return "nxv2f16"; - case MVT::nxv4f16: return "nxv4f16"; - case MVT::nxv8f16: return "nxv8f16"; - case MVT::nxv1f32: return 
"nxv1f32"; - case MVT::nxv2f32: return "nxv2f32"; - case MVT::nxv4f32: return "nxv4f32"; - case MVT::nxv8f32: return "nxv8f32"; - case MVT::nxv16f32:return "nxv16f32"; - case MVT::nxv1f64: return "nxv1f64"; - case MVT::nxv2f64: return "nxv2f64"; - case MVT::nxv4f64: return "nxv4f64"; - case MVT::nxv8f64: return "nxv8f64"; case MVT::Metadata:return "Metadata"; case MVT::Untyped: return "Untyped"; case MVT::exnref : return "exnref"; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index 4e70e232a9b5e..b268d2e6aef52 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -288,6 +288,7 @@ static void dumpRnglistsSection( static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts, DWARFDataExtractor Data, const MCRegisterInfo *MRI, + const DWARFObject &Obj, Optional DumpOffset) { uint64_t Offset = 0; @@ -306,13 +307,13 @@ static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts, if (DumpOffset) { if (DumpOffset >= Offset && DumpOffset < EndOffset) { Offset = *DumpOffset; - Loc.dumpLocationList(&Offset, OS, /*BaseAddr=*/None, MRI, nullptr, + Loc.dumpLocationList(&Offset, OS, /*BaseAddr=*/None, MRI, Obj, nullptr, DumpOpts, /*Indent=*/0); OS << "\n"; return; } } else { - Loc.dumpRange(Offset, EndOffset - Offset, OS, MRI, DumpOpts); + Loc.dumpRange(Offset, EndOffset - Offset, OS, MRI, Obj, DumpOpts); } Offset = EndOffset; } @@ -394,21 +395,21 @@ void DWARFContext::dump( if (const auto *Off = shouldDump(Explicit, ".debug_loc", DIDT_ID_DebugLoc, DObj->getLocSection().Data)) { - getDebugLoc()->dump(OS, getRegisterInfo(), LLDumpOpts, *Off); + getDebugLoc()->dump(OS, getRegisterInfo(), *DObj, LLDumpOpts, *Off); } if (const auto *Off = shouldDump(Explicit, ".debug_loclists", DIDT_ID_DebugLoclists, DObj->getLoclistsSection().Data)) { DWARFDataExtractor Data(*DObj, DObj->getLoclistsSection(), isLittleEndian(), 0); - dumpLoclistsSection(OS, LLDumpOpts, Data, getRegisterInfo(), *Off); + dumpLoclistsSection(OS, LLDumpOpts, Data, getRegisterInfo(), *DObj, *Off); } if (const auto *Off = shouldDump(ExplicitDWO, ".debug_loclists.dwo", DIDT_ID_DebugLoclists, DObj->getLoclistsDWOSection().Data)) { DWARFDataExtractor Data(*DObj, DObj->getLoclistsDWOSection(), isLittleEndian(), 0); - dumpLoclistsSection(OS, LLDumpOpts, Data, getRegisterInfo(), *Off); + dumpLoclistsSection(OS, LLDumpOpts, Data, getRegisterInfo(), *DObj, *Off); } if (const auto *Off = @@ -420,11 +421,11 @@ void DWARFContext::dump( if (*Off) { uint64_t Offset = **Off; Loc.dumpLocationList(&Offset, OS, - /*BaseAddr=*/None, getRegisterInfo(), nullptr, + /*BaseAddr=*/None, getRegisterInfo(), *DObj, nullptr, LLDumpOpts, /*Indent=*/0); OS << "\n"; } else { - Loc.dumpRange(0, Data.getData().size(), OS, getRegisterInfo(), + Loc.dumpRange(0, Data.getData().size(), OS, getRegisterInfo(), *DObj, LLDumpOpts); } } @@ -441,6 +442,9 @@ void DWARFContext::dump( if (Explicit || !getDebugMacro()->empty()) { OS << "\n.debug_macinfo contents:\n"; getDebugMacro()->dump(OS); + } else if (ExplicitDWO || !getDebugMacroDWO()->empty()) { + OS << "\n.debug_macinfo.dwo contents:\n"; + getDebugMacroDWO()->dump(OS); } } @@ -797,6 +801,17 @@ const DWARFDebugFrame *DWARFContext::getEHFrame() { return DebugFrame.get(); } +const DWARFDebugMacro *DWARFContext::getDebugMacroDWO() { + if (MacroDWO) + return MacroDWO.get(); + + DataExtractor MacinfoDWOData(DObj->getMacinfoDWOSection(), isLittleEndian(), + 0); + MacroDWO.reset(new DWARFDebugMacro()); + 
MacroDWO->parse(MacinfoDWOData); + return MacroDWO.get(); +} + const DWARFDebugMacro *DWARFContext::getDebugMacro() { if (Macro) return Macro.get(); @@ -1500,6 +1515,7 @@ class DWARFObjInMemory final : public DWARFObject { StringRef ArangesSection; StringRef StrSection; StringRef MacinfoSection; + StringRef MacinfoDWOSection; StringRef AbbrevDWOSection; StringRef StrDWOSection; StringRef CUIndexSection; @@ -1519,6 +1535,7 @@ class DWARFObjInMemory final : public DWARFObject { .Case("debug_aranges", &ArangesSection) .Case("debug_str", &StrSection) .Case("debug_macinfo", &MacinfoSection) + .Case("debug_macinfo.dwo", &MacinfoDWOSection) .Case("debug_abbrev.dwo", &AbbrevDWOSection) .Case("debug_str.dwo", &StrDWOSection) .Case("debug_cu_index", &CUIndexSection) @@ -1845,6 +1862,7 @@ class DWARFObjInMemory final : public DWARFObject { return RnglistsSection; } StringRef getMacinfoSection() const override { return MacinfoSection; } + StringRef getMacinfoDWOSection() const override { return MacinfoDWOSection; } const DWARFSection &getPubnamesSection() const override { return PubnamesSection; } const DWARFSection &getPubtypesSection() const override { return PubtypesSection; } const DWARFSection &getGnuPubnamesSection() const override { diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index baa35eb813cf5..8aed9ab653a16 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -57,6 +57,17 @@ DWARFLocationInterpreter::Interpret(const DWARFLocationEntry &E) { return createResolverError(E.Value0, E.Kind); return None; } + case dwarf::DW_LLE_startx_endx: { + Optional LowPC = LookupAddr(E.Value0); + if (!LowPC) + return createResolverError(E.Value0, E.Kind); + Optional HighPC = LookupAddr(E.Value1); + if (!HighPC) + return createResolverError(E.Value1, E.Kind); + return DWARFLocationExpression{ + DWARFAddressRange{LowPC->Address, HighPC->Address, LowPC->SectionIndex}, + E.Loc}; + } case dwarf::DW_LLE_startx_length: { Optional LowPC = LookupAddr(E.Value0); if (!LowPC) @@ -66,23 +77,29 @@ DWARFLocationInterpreter::Interpret(const DWARFLocationEntry &E) { LowPC->SectionIndex}, E.Loc}; } - case dwarf::DW_LLE_offset_pair: + case dwarf::DW_LLE_offset_pair: { if (!Base) { return createStringError( inconvertibleErrorCode(), "Unable to resolve DW_LLE_offset_pair: base address unknown"); } - return DWARFLocationExpression{DWARFAddressRange{Base->Address + E.Value0, - Base->Address + E.Value1, - Base->SectionIndex}, - E.Loc}; + DWARFAddressRange Range{Base->Address + E.Value0, Base->Address + E.Value1, + Base->SectionIndex}; + if (Range.SectionIndex == SectionedAddress::UndefSection) + Range.SectionIndex = E.SectionIndex; + return DWARFLocationExpression{Range, E.Loc}; + } + case dwarf::DW_LLE_default_location: + return DWARFLocationExpression{None, E.Loc}; case dwarf::DW_LLE_base_address: - Base = SectionedAddress{E.Value0, SectionedAddress::UndefSection}; + Base = SectionedAddress{E.Value0, E.SectionIndex}; return None; + case dwarf::DW_LLE_start_end: + return DWARFLocationExpression{ + DWARFAddressRange{E.Value0, E.Value1, E.SectionIndex}, E.Loc}; case dwarf::DW_LLE_start_length: return DWARFLocationExpression{ - DWARFAddressRange{E.Value0, E.Value0 + E.Value1, - SectionedAddress::UndefSection}, + DWARFAddressRange{E.Value0, E.Value0 + E.Value1, E.SectionIndex}, E.Loc}; default: llvm_unreachable("unreachable locations list kind"); @@ -104,7 +121,8 @@ static void dumpExpression(raw_ostream &OS, ArrayRef Data, 
bool DWARFLocationTable::dumpLocationList(uint64_t *Offset, raw_ostream &OS, Optional BaseAddr, const MCRegisterInfo *MRI, - DWARFUnit *U, DIDumpOptions DumpOpts, + const DWARFObject &Obj, DWARFUnit *U, + DIDumpOptions DumpOpts, unsigned Indent) const { DWARFLocationInterpreter Interp( BaseAddr, [U](uint32_t Index) -> Optional { @@ -116,7 +134,7 @@ bool DWARFLocationTable::dumpLocationList(uint64_t *Offset, raw_ostream &OS, Error E = visitLocationList(Offset, [&](const DWARFLocationEntry &E) { Expected> Loc = Interp.Interpret(E); if (!Loc || DumpOpts.DisplayRawContents) - dumpRawEntry(E, OS, Indent); + dumpRawEntry(E, OS, Indent, DumpOpts, Obj); if (Loc && *Loc) { OS << "\n"; OS.indent(Indent); @@ -125,10 +143,10 @@ bool DWARFLocationTable::dumpLocationList(uint64_t *Offset, raw_ostream &OS, DIDumpOptions RangeDumpOpts(DumpOpts); RangeDumpOpts.DisplayRawContents = false; - const DWARFObject *Obj = nullptr; - if (U) - Obj = &U->getContext().getDWARFObj(); - Loc.get()->Range->dump(OS, Data.getAddressSize(), RangeDumpOpts, Obj); + if (Loc.get()->Range) + Loc.get()->Range->dump(OS, Data.getAddressSize(), RangeDumpOpts, &Obj); + else + OS << ""; } if (!Loc) consumeError(Loc.takeError()); @@ -167,12 +185,12 @@ Error DWARFLocationTable::visitAbsoluteLocationList( } void DWARFDebugLoc::dump(raw_ostream &OS, const MCRegisterInfo *MRI, - DIDumpOptions DumpOpts, + const DWARFObject &Obj, DIDumpOptions DumpOpts, Optional DumpOffset) const { auto BaseAddr = None; unsigned Indent = 12; if (DumpOffset) { - dumpLocationList(&*DumpOffset, OS, BaseAddr, MRI, nullptr, DumpOpts, + dumpLocationList(&*DumpOffset, OS, BaseAddr, MRI, Obj, nullptr, DumpOpts, Indent); } else { uint64_t Offset = 0; @@ -182,7 +200,7 @@ void DWARFDebugLoc::dump(raw_ostream &OS, const MCRegisterInfo *MRI, OS << Separator; Separator = "\n"; - CanContinue = dumpLocationList(&Offset, OS, BaseAddr, MRI, nullptr, + CanContinue = dumpLocationList(&Offset, OS, BaseAddr, MRI, Obj, nullptr, DumpOpts, Indent); OS << '\n'; } @@ -194,8 +212,9 @@ Error DWARFDebugLoc::visitLocationList( function_ref Callback) const { DataExtractor::Cursor C(*Offset); while (true) { + uint64_t SectionIndex; uint64_t Value0 = Data.getRelocatedAddress(C); - uint64_t Value1 = Data.getRelocatedAddress(C); + uint64_t Value1 = Data.getRelocatedAddress(C, &SectionIndex); DWARFLocationEntry E; @@ -208,10 +227,12 @@ Error DWARFDebugLoc::visitLocationList( } else if (Value0 == (Data.getAddressSize() == 4 ? -1U : -1ULL)) { E.Kind = dwarf::DW_LLE_base_address; E.Value0 = Value1; + E.SectionIndex = SectionIndex; } else { E.Kind = dwarf::DW_LLE_offset_pair; E.Value0 = Value0; E.Value1 = Value1; + E.SectionIndex = SectionIndex; unsigned Bytes = Data.getU16(C); // A single location description describing the location of the object... 
Data.getU8(C, E.Loc, Bytes); @@ -227,7 +248,9 @@ Error DWARFDebugLoc::visitLocationList( } void DWARFDebugLoc::dumpRawEntry(const DWARFLocationEntry &Entry, - raw_ostream &OS, unsigned Indent) const { + raw_ostream &OS, unsigned Indent, + DIDumpOptions DumpOpts, + const DWARFObject &Obj) const { uint64_t Value0, Value1; switch (Entry.Kind) { case dwarf::DW_LLE_base_address: @@ -248,6 +271,7 @@ void DWARFDebugLoc::dumpRawEntry(const DWARFLocationEntry &Entry, OS.indent(Indent); OS << '(' << format_hex(Value0, 2 + Data.getAddressSize() * 2) << ", " << format_hex(Value1, 2 + Data.getAddressSize() * 2) << ')'; + DWARFFormValue::dumpAddressSection(Obj, OS, DumpOpts, Entry.SectionIndex); } Error DWARFDebugLoclists::visitLocationList( @@ -264,6 +288,10 @@ Error DWARFDebugLoclists::visitLocationList( case dwarf::DW_LLE_base_addressx: E.Value0 = Data.getULEB128(C); break; + case dwarf::DW_LLE_startx_endx: + E.Value0 = Data.getULEB128(C); + E.Value1 = Data.getULEB128(C); + break; case dwarf::DW_LLE_startx_length: E.Value0 = Data.getULEB128(C); // Pre-DWARF 5 has different interpretation of the length field. We have @@ -276,17 +304,21 @@ Error DWARFDebugLoclists::visitLocationList( case dwarf::DW_LLE_offset_pair: E.Value0 = Data.getULEB128(C); E.Value1 = Data.getULEB128(C); + E.SectionIndex = SectionedAddress::UndefSection; + break; + case dwarf::DW_LLE_default_location: break; case dwarf::DW_LLE_base_address: - E.Value0 = Data.getRelocatedAddress(C); + E.Value0 = Data.getRelocatedAddress(C, &E.SectionIndex); + break; + case dwarf::DW_LLE_start_end: + E.Value0 = Data.getRelocatedAddress(C, &E.SectionIndex); + E.Value1 = Data.getRelocatedAddress(C); break; case dwarf::DW_LLE_start_length: - E.Value0 = Data.getRelocatedAddress(C); + E.Value0 = Data.getRelocatedAddress(C, &E.SectionIndex); E.Value1 = Data.getULEB128(C); break; - case dwarf::DW_LLE_startx_endx: - case dwarf::DW_LLE_default_location: - case dwarf::DW_LLE_start_end: default: cantFail(C.takeError()); return createStringError(errc::illegal_byte_sequence, @@ -310,7 +342,9 @@ Error DWARFDebugLoclists::visitLocationList( } void DWARFDebugLoclists::dumpRawEntry(const DWARFLocationEntry &Entry, - raw_ostream &OS, unsigned Indent) const { + raw_ostream &OS, unsigned Indent, + DIDumpOptions DumpOpts, + const DWARFObject &Obj) const { size_t MaxEncodingStringLength = 0; #define HANDLE_DW_LLE(ID, NAME) \ MaxEncodingStringLength = std::max(MaxEncodingStringLength, \ @@ -325,9 +359,14 @@ void DWARFDebugLoclists::dumpRawEntry(const DWARFLocationEntry &Entry, OS << format("%-*s(", MaxEncodingStringLength, EncodingString.data()); unsigned FieldSize = 2 + 2 * Data.getAddressSize(); switch (Entry.Kind) { + case dwarf::DW_LLE_end_of_list: + case dwarf::DW_LLE_default_location: + break; + case dwarf::DW_LLE_startx_endx: case dwarf::DW_LLE_startx_length: - case dwarf::DW_LLE_start_length: case dwarf::DW_LLE_offset_pair: + case dwarf::DW_LLE_start_end: + case dwarf::DW_LLE_start_length: OS << format_hex(Entry.Value0, FieldSize) << ", " << format_hex(Entry.Value1, FieldSize); break; @@ -335,14 +374,22 @@ void DWARFDebugLoclists::dumpRawEntry(const DWARFLocationEntry &Entry, case dwarf::DW_LLE_base_address: OS << format_hex(Entry.Value0, FieldSize); break; - case dwarf::DW_LLE_end_of_list: - break; } OS << ')'; + switch (Entry.Kind) { + case dwarf::DW_LLE_base_address: + case dwarf::DW_LLE_start_end: + case dwarf::DW_LLE_start_length: + DWARFFormValue::dumpAddressSection(Obj, OS, DumpOpts, Entry.SectionIndex); + break; + default: + break; + } } void 
DWARFDebugLoclists::dumpRange(uint64_t StartOffset, uint64_t Size, raw_ostream &OS, const MCRegisterInfo *MRI, + const DWARFObject &Obj, DIDumpOptions DumpOpts) { if (!Data.isValidOffsetForDataOfSize(StartOffset, Size)) { OS << "Invalid dump range\n"; @@ -355,8 +402,8 @@ void DWARFDebugLoclists::dumpRange(uint64_t StartOffset, uint64_t Size, OS << Separator; Separator = "\n"; - CanContinue = dumpLocationList(&Offset, OS, /*BaseAddr=*/None, MRI, nullptr, - DumpOpts, /*Indent=*/12); + CanContinue = dumpLocationList(&Offset, OS, /*BaseAddr=*/None, MRI, Obj, + nullptr, DumpOpts, /*Indent=*/12); OS << '\n'; } } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp index f6785b89e86d4..9ae4c5b73ebe9 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugRnglists.cpp @@ -114,12 +114,21 @@ Error RangeListEntry::extract(DWARFDataExtractor Data, uint64_t End, DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges( llvm::Optional BaseAddr, DWARFUnit &U) const { + return getAbsoluteRanges(BaseAddr, [&](uint32_t Index) { + return U.getAddrOffsetSectionItem(Index); + }); +} + +DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges( + Optional BaseAddr, + function_ref(uint32_t)> + LookupPooledAddress) const { DWARFAddressRangesVector Res; for (const RangeListEntry &RLE : Entries) { if (RLE.EntryKind == dwarf::DW_RLE_end_of_list) break; if (RLE.EntryKind == dwarf::DW_RLE_base_addressx) { - BaseAddr = U.getAddrOffsetSectionItem(RLE.Value0); + BaseAddr = LookupPooledAddress(RLE.Value0); if (!BaseAddr) BaseAddr = {RLE.Value0, -1ULL}; continue; @@ -152,7 +161,7 @@ DWARFAddressRangesVector DWARFDebugRnglist::getAbsoluteRanges( E.HighPC = E.LowPC + RLE.Value1; break; case dwarf::DW_RLE_startx_length: { - auto Start = U.getAddrOffsetSectionItem(RLE.Value0); + auto Start = LookupPooledAddress(RLE.Value0); if (!Start) Start = {0, -1ULL}; E.SectionIndex = Start->SectionIndex; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index cc3d021b0ddbd..4b86359c04e3f 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -91,12 +91,13 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue, FormValue.dump(OS, DumpOpts); if (auto LoclistOffset = U->getLoclistOffset(Offset)) - Offset = *LoclistOffset + U->getLocSectionBase(); + Offset = *LoclistOffset; else return; } U->getLocationTable().dumpLocationList(&Offset, OS, U->getBaseAddress(), - MRI, U, DumpOpts, Indent); + MRI, Ctx.getDWARFObj(), U, DumpOpts, + Indent); return; } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp index b662e88816f8a..4ccda628093c9 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -629,7 +629,7 @@ DWARFUnit::findRnglistFromOffset(uint64_t Offset) { Expected DWARFUnit::findRnglistFromIndex(uint32_t Index) { if (auto Offset = getRnglistOffset(Index)) - return findRnglistFromOffset(*Offset + RangeSectionBase); + return findRnglistFromOffset(*Offset); if (RngListTable) return createStringError(errc::invalid_argument, diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index be79d9e637c14..cb076aed3aac4 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -284,6 +284,79 @@ bool darwinDsymMatchesBinary(const MachOObjectFile *DbgObj, return 
!memcmp(dbg_uuid.data(), bin_uuid.data(), dbg_uuid.size()); } +template +Optional> getBuildID(const ELFFile *Obj) { + if (!Obj) + return {}; + auto PhdrsOrErr = Obj->program_headers(); + if (!PhdrsOrErr) { + consumeError(PhdrsOrErr.takeError()); + return {}; + } + for (const auto &P : *PhdrsOrErr) { + if (P.p_type != ELF::PT_NOTE) + continue; + Error Err = Error::success(); + for (const auto &N : Obj->notes(P, Err)) + if (N.getType() == ELF::NT_GNU_BUILD_ID && N.getName() == ELF::ELF_NOTE_GNU) + return N.getDesc(); + } + return {}; +} + +Optional> getBuildID(const ELFObjectFileBase *Obj) { + Optional> BuildID; + if (auto *O = dyn_cast>(Obj)) + BuildID = getBuildID(O->getELFFile()); + else if (auto *O = dyn_cast>(Obj)) + BuildID = getBuildID(O->getELFFile()); + else if (auto *O = dyn_cast>(Obj)) + BuildID = getBuildID(O->getELFFile()); + else if (auto *O = dyn_cast>(Obj)) + BuildID = getBuildID(O->getELFFile()); + else + llvm_unreachable("unsupported file format"); + return BuildID; +} + +bool findDebugBinary(const std::vector &DebugFileDirectory, + const ArrayRef BuildID, + std::string &Result) { + auto getDebugPath = [&](StringRef Directory) { + SmallString<128> Path{Directory}; + sys::path::append(Path, ".build-id", + llvm::toHex(BuildID[0], /*LowerCase=*/true), + llvm::toHex(BuildID.slice(1), /*LowerCase=*/true)); + Path += ".debug"; + return Path; + }; + if (DebugFileDirectory.empty()) { + SmallString<128> Path = getDebugPath( +#if defined(__NetBSD__) + // Try /usr/libdata/debug/.build-id/../... + "/usr/libdata/debug" +#else + // Try /usr/lib/debug/.build-id/../... + "/usr/lib/debug" +#endif + ); + if (llvm::sys::fs::exists(Path)) { + Result = Path.str(); + return true; + } + } else { + for (const auto &Directory : DebugFileDirectory) { + // Try /.build-id/../... 
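// A standalone illustration (local helpers, not llvm::toHex or sys::path)
// of the .build-id file layout getDebugPath composes above: the first byte
// of the GNU build ID names a subdirectory and the remaining bytes name
// the file, e.g. ID 0xAB 0xCD 0xEF maps to <dir>/.build-id/ab/cdef.debug.
// Callers check the ID has at least two bytes, as lookUpBuildIDObject does.
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

static std::string toLowerHex(const std::vector<uint8_t> &Bytes,
                              std::size_t Begin, std::size_t End) {
  static const char Digits[] = "0123456789abcdef";
  std::string S;
  for (std::size_t I = Begin; I < End; ++I) {
    S += Digits[Bytes[I] >> 4];
    S += Digits[Bytes[I] & 0xF];
  }
  return S;
}

std::string buildIDDebugPath(const std::string &Dir,
                             const std::vector<uint8_t> &ID) {
  // Mirrors getDebugPath: <Dir>/.build-id/<ID[0]>/<ID[1:]>.debug
  return Dir + "/.build-id/" + toLowerHex(ID, 0, 1) + "/" +
         toLowerHex(ID, 1, ID.size()) + ".debug";
}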
+ SmallString<128> Path = getDebugPath(Directory); + if (llvm::sys::fs::exists(Path)) { + Result = Path.str(); + return true; + } + } + } + return false; +} + } // end anonymous namespace ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath, @@ -335,6 +408,25 @@ ObjectFile *LLVMSymbolizer::lookUpDebuglinkObject(const std::string &Path, return DbgObjOrErr.get(); } +ObjectFile *LLVMSymbolizer::lookUpBuildIDObject(const std::string &Path, + const ELFObjectFileBase *Obj, + const std::string &ArchName) { + auto BuildID = getBuildID(Obj); + if (!BuildID) + return nullptr; + if (BuildID->size() < 2) + return nullptr; + std::string DebugBinaryPath; + if (!findDebugBinary(Opts.DebugFileDirectory, *BuildID, DebugBinaryPath)) + return nullptr; + auto DbgObjOrErr = getOrCreateObject(DebugBinaryPath, ArchName); + if (!DbgObjOrErr) { + consumeError(DbgObjOrErr.takeError()); + return nullptr; + } + return DbgObjOrErr.get(); +} + Expected LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path, const std::string &ArchName) { @@ -355,6 +447,8 @@ LLVMSymbolizer::getOrCreateObjectPair(const std::string &Path, if (auto MachObj = dyn_cast(Obj)) DbgObj = lookUpDsymFile(Path, MachObj, ArchName); + else if (auto ELFObj = dyn_cast(Obj)) + DbgObj = lookUpBuildIDObject(Path, ELFObj, ArchName); if (!DbgObj) DbgObj = lookUpDebuglinkObject(Path, Obj, ArchName); if (!DbgObj) diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp index 9df79670d9fba..6c924f8895776 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp @@ -266,6 +266,16 @@ void LinkGraph::dump(raw_ostream &OS, << "\n"; } +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LF) { + switch (LF) { + case SymbolLookupFlags::RequiredSymbol: + return OS << "RequiredSymbol"; + case SymbolLookupFlags::WeaklyReferencedSymbol: + return OS << "WeaklyReferencedSymbol"; + } + llvm_unreachable("Unrecognized lookup flags"); +} + void JITLinkAsyncLookupContinuation::anchor() {} JITLinkContext::~JITLinkContext() {} diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 9707b9624d936..7b594fd2c0ea9 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -257,25 +257,35 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) { return Error::success(); } -DenseSet JITLinkerBase::getExternalSymbolNames() const { +JITLinkContext::LookupMap JITLinkerBase::getExternalSymbolNames() const { // Identify unresolved external symbols. - DenseSet UnresolvedExternals; + JITLinkContext::LookupMap UnresolvedExternals; for (auto *Sym : G->external_symbols()) { assert(Sym->getAddress() == 0 && "External has already been assigned an address"); assert(Sym->getName() != StringRef() && Sym->getName() != "" && "Externals must be named"); - UnresolvedExternals.insert(Sym->getName()); + SymbolLookupFlags LookupFlags = + Sym->getLinkage() == Linkage::Weak + ? 
SymbolLookupFlags::WeaklyReferencedSymbol + : SymbolLookupFlags::RequiredSymbol; + UnresolvedExternals[Sym->getName()] = LookupFlags; } return UnresolvedExternals; } void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { for (auto *Sym : G->external_symbols()) { + assert(Sym->getOffset() == 0 && + "External symbol is not at the start of its addressable block"); assert(Sym->getAddress() == 0 && "Symbol already resolved"); assert(!Sym->isDefined() && "Symbol being resolved is already defined"); - assert(Result.count(Sym->getName()) && "Missing resolution for symbol"); - Sym->getAddressable().setAddress(Result[Sym->getName()].getAddress()); + auto ResultI = Result.find(Sym->getName()); + if (ResultI != Result.end()) + Sym->getAddressable().setAddress(ResultI->second.getAddress()); + else + assert(Sym->getLinkage() == Linkage::Weak && + "Failed to resolve non-weak reference"); } LLVM_DEBUG({ @@ -285,8 +295,11 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) { << formatv("{0:x16}", Sym->getAddress()) << "\n"; }); assert(llvm::all_of(G->external_symbols(), - [](Symbol *Sym) { return Sym->getAddress() != 0; }) && - "All symbols should have been resolved by this point"); + [](Symbol *Sym) { + return Sym->getAddress() != 0 || + Sym->getLinkage() == Linkage::Weak; + }) && + "All strong external symbols should have been resolved by now"); } void JITLinkerBase::deallocateAndBailOut(Error Err) { diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h index 07dee6cee2002..d5687b7afc967 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h @@ -106,7 +106,7 @@ class JITLinkerBase { SegmentLayoutMap layOutBlocks(); Error allocateSegments(const SegmentLayoutMap &Layout); - DenseSet getExternalSymbolNames() const; + JITLinkContext::LookupMap getExternalSymbolNames() const; void applyLookupResult(AsyncLookupResult LR); void deallocateAndBailOut(Error Err); diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp index c1dc138ee7024..1881bd0b287e0 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp @@ -321,7 +321,9 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() { return make_error("Anonymous external symbol at " "index " + Twine(KV.first)); - NSym.GraphSymbol = &G->addExternalSymbol(*NSym.Name, 0); + NSym.GraphSymbol = &G->addExternalSymbol( + *NSym.Name, 0, + NSym.Desc & MachO::N_WEAK_REF ? 
Linkage::Weak : Linkage::Strong); } break; case MachO::N_ABS: diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp index 9dbfb6556e317..69ec72aae2928 100644 --- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp @@ -252,7 +252,7 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { TargetSymbol = TargetSymbolOrErr->GraphSymbol; else return TargetSymbolOrErr.takeError(); - Addend = *(const ulittle32_t *)FixupContent; + Addend = *(const little32_t *)FixupContent; break; case Pointer32: if (auto TargetSymbolOrErr = findSymbolByIndex(RI.r_symbolnum)) @@ -284,12 +284,12 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { TargetSymbol = TargetSymbolOrErr->GraphSymbol; else return TargetSymbolOrErr.takeError(); - Addend = *(const ulittle32_t *)FixupContent + + Addend = *(const little32_t *)FixupContent + (1 << (*Kind - PCRel32Minus1)); break; case PCRel32Anon: { JITTargetAddress TargetAddress = - FixupAddress + 4 + *(const ulittle32_t *)FixupContent; + FixupAddress + 4 + *(const little32_t *)FixupContent; if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) TargetSymbol = &*TargetSymbolOrErr; else @@ -303,7 +303,7 @@ class MachOLinkGraphBuilder_x86_64 : public MachOLinkGraphBuilder { JITTargetAddress Delta = static_cast(1ULL << (*Kind - PCRel32Minus1Anon)); JITTargetAddress TargetAddress = - FixupAddress + 4 + Delta + *(const ulittle32_t *)FixupContent; + FixupAddress + 4 + Delta + *(const little32_t *)FixupContent; if (auto TargetSymbolOrErr = findSymbolByAddress(TargetAddress)) TargetSymbol = &*TargetSymbolOrErr; else diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 75ddbc30445d2..f26835ff8a085 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -162,7 +162,8 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, return; } - R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), true)); + R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), + JITDylibLookupFlags::MatchAllSymbols)); R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), std::move(Callables), AliaseeImpls)); } @@ -171,18 +172,22 @@ CompileOnDemandLayer::PerDylibResources & CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) { auto I = DylibResources.find(&TargetD); if (I == DylibResources.end()) { - auto &ImplD = getExecutionSession().createJITDylib( - TargetD.getName() + ".impl", false); - TargetD.withSearchOrderDo([&](const JITDylibSearchList &TargetSearchOrder) { - auto NewSearchOrder = TargetSearchOrder; - assert(!NewSearchOrder.empty() && - NewSearchOrder.front().first == &TargetD && - NewSearchOrder.front().second == true && - "TargetD must be at the front of its own search order and match " - "non-exported symbol"); - NewSearchOrder.insert(std::next(NewSearchOrder.begin()), {&ImplD, true}); - ImplD.setSearchOrder(std::move(NewSearchOrder), false); - }); + auto &ImplD = + getExecutionSession().createJITDylib(TargetD.getName() + ".impl"); + TargetD.withSearchOrderDo( + [&](const JITDylibSearchOrder &TargetSearchOrder) { + auto NewSearchOrder = TargetSearchOrder; + assert( + !NewSearchOrder.empty() && + NewSearchOrder.front().first == &TargetD && + NewSearchOrder.front().second == + JITDylibLookupFlags::MatchAllSymbols && + "TargetD must be at 
the front of its own search order and match "
+ "non-exported symbol");
+ NewSearchOrder.insert(std::next(NewSearchOrder.begin()),
+ {&ImplD, JITDylibLookupFlags::MatchAllSymbols});
+ ImplD.setSearchOrder(std::move(NewSearchOrder), false);
+ });
PerDylibResources PDR(ImplD, BuildIndirectStubsManager());
I = DylibResources.insert(std::make_pair(&TargetD, std::move(PDR))).first;
}
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 9e024ba0f10f8..63ef889dae464 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/Core.h"
+
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/ExecutionEngine/Orc/OrcError.h"
#include "llvm/IR/Mangler.h"
@@ -77,16 +79,19 @@ bool flagsMatchCLOpts(const JITSymbolFlags &Flags) {
#endif // NDEBUG
}

-// Prints a set of items, filtered by an user-supplied predicate.
-template >
-class SetPrinter {
+// Prints a sequence of items, filtered by a user-supplied predicate.
+template >
+class SequencePrinter {
public:
- SetPrinter(const Set &S, Pred ShouldPrint = Pred())
- : S(S), ShouldPrint(std::move(ShouldPrint)) {}
+ SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq,
+ Pred ShouldPrint = Pred())
+ : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq),
+ ShouldPrint(std::move(ShouldPrint)) {}

void printTo(llvm::raw_ostream &OS) const {
bool PrintComma = false;
- OS << "{";
+ OS << OpenSeq;
for (auto &E : S) {
if (ShouldPrint(E)) {
if (PrintComma)
@@ -95,23 +100,26 @@ class SetPrinter {
OS << ',';
OS << ' ' << E;
PrintComma = true;
}
}
- OS << " }";
+ OS << ' ' << CloseSeq;
}

private:
- const Set &S;
+ const Sequence &S;
+ char OpenSeq;
+ char CloseSeq;
mutable Pred ShouldPrint;
};

-template
-SetPrinter printSet(const Set &S, Pred P = Pred()) {
- return SetPrinter(S, std::move(P));
+template
+SequencePrinter printSequence(const Sequence &S, char OpenSeq,
+ char CloseSeq, Pred P = Pred()) {
+ return SequencePrinter(S, OpenSeq, CloseSeq, std::move(P));
}

-// Render a SetPrinter by delegating to its printTo method.
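// A minimal std::ostream analogue (not the llvm::raw_ostream version in
// this patch) of the SequencePrinter/printSequence pattern, showing how
// the configurable delimiters let sets render as "{ ... }" and vectors
// as "[ ... ]" through a single operator<<.
#include <iostream>
#include <vector>

template <typename Sequence, typename Pred>
struct SeqPrinter {
  const Sequence &S;
  char Open, Close;
  Pred ShouldPrint;
};

template <typename Sequence, typename Pred>
std::ostream &operator<<(std::ostream &OS,
                         const SeqPrinter<Sequence, Pred> &P) {
  OS << P.Open;
  bool Comma = false;
  for (const auto &E : P.S) {
    if (!P.ShouldPrint(E))
      continue; // filtered out by the user-supplied predicate
    if (Comma)
      OS << ',';
    OS << ' ' << E;
    Comma = true;
  }
  return OS << ' ' << P.Close;
}

template <typename Sequence, typename Pred>
SeqPrinter<Sequence, Pred> printSeq(const Sequence &S, char Open, char Close,
                                    Pred P) {
  return {S, Open, Close, P};
}

int main() {
  std::vector<int> V{1, 2, 3, 4};
  // Prints "[ 1, 2, 3 ]": vector delimiters plus a filtering predicate.
  std::cout << printSeq(V, '[', ']', [](int E) { return E < 4; }) << '\n';
}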
+template llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const SetPrinter &Printer) { + const SequencePrinter &Printer) { Printer.printTo(OS); return OS; } @@ -147,7 +155,11 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) { } raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { - return OS << printSet(Symbols, PrintAll()); + return OS << printSequence(Symbols, '{', '}', PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) { + return OS << printSequence(Symbols, '[', ']', PrintAll()); } raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) { @@ -182,11 +194,13 @@ raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) { } raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) { - return OS << printSet(SymbolFlags, PrintSymbolFlagsMapElemsMatchingCLOpts()); + return OS << printSequence(SymbolFlags, '{', '}', + PrintSymbolFlagsMapElemsMatchingCLOpts()); } raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) { - return OS << printSet(Symbols, PrintSymbolMapElemsMatchingCLOpts()); + return OS << printSequence(Symbols, '{', '}', + PrintSymbolMapElemsMatchingCLOpts()); } raw_ostream &operator<<(raw_ostream &OS, @@ -195,7 +209,8 @@ raw_ostream &operator<<(raw_ostream &OS, } raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) { - return OS << printSet(Deps, PrintAll()); + return OS << printSequence(Deps, '{', '}', + PrintAll()); } raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) { @@ -205,16 +220,59 @@ raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) { return OS << ")"; } -raw_ostream &operator<<(raw_ostream &OS, const JITDylibSearchList &JDs) { +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K) { + switch (K) { + case LookupKind::Static: + return OS << "Static"; + case LookupKind::DLSym: + return OS << "DLSym"; + } + llvm_unreachable("Invalid lookup kind"); +} + +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibLookupFlags &JDLookupFlags) { + switch (JDLookupFlags) { + case JITDylibLookupFlags::MatchExportedSymbolsOnly: + return OS << "MatchExportedSymbolsOnly"; + case JITDylibLookupFlags::MatchAllSymbols: + return OS << "MatchAllSymbols"; + } + llvm_unreachable("Invalid JITDylib lookup flags"); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) { + switch (LookupFlags) { + case SymbolLookupFlags::RequiredSymbol: + return OS << "RequiredSymbol"; + case SymbolLookupFlags::WeaklyReferencedSymbol: + return OS << "WeaklyReferencedSymbol"; + } + llvm_unreachable("Invalid symbol lookup flags"); +} + +raw_ostream &operator<<(raw_ostream &OS, + const SymbolLookupSet::value_type &KV) { + return OS << "(" << KV.first << ", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) { + return OS << printSequence(LookupSet, '{', '}', + PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibSearchOrder &SearchOrder) { OS << "["; - if (!JDs.empty()) { - assert(JDs.front().first && "JITDylibList entries must not be null"); - OS << " (\"" << JDs.front().first->getName() << "\", " - << (JDs.front().second ? 
"true" : "false") << ")"; - for (auto &KV : make_range(std::next(JDs.begin()), JDs.end())) { + if (!SearchOrder.empty()) { + assert(SearchOrder.front().first && + "JITDylibList entries must not be null"); + OS << " (\"" << SearchOrder.front().first->getName() << "\", " + << SearchOrder.begin()->second << ")"; + for (auto &KV : + make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) { assert(KV.first && "JITDylibList entries must not be null"); - OS << ", (\"" << KV.first->getName() << "\", " - << (KV.second ? "true" : "false") << ")"; + OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")"; } } OS << " ]"; @@ -262,7 +320,13 @@ void FailedToMaterialize::log(raw_ostream &OS) const { OS << "Failed to materialize symbols: " << *Symbols; } -SymbolsNotFound::SymbolsNotFound(SymbolNameSet Symbols) +SymbolsNotFound::SymbolsNotFound(SymbolNameSet Symbols) { + for (auto &Sym : Symbols) + this->Symbols.push_back(Sym); + assert(!this->Symbols.empty() && "Can not fail to resolve an empty set"); +} + +SymbolsNotFound::SymbolsNotFound(SymbolNameVector Symbols) : Symbols(std::move(Symbols)) { assert(!this->Symbols.empty() && "Can not fail to resolve an empty set"); } @@ -289,7 +353,7 @@ void SymbolsCouldNotBeRemoved::log(raw_ostream &OS) const { } AsynchronousSymbolQuery::AsynchronousSymbolQuery( - const SymbolNameSet &Symbols, SymbolState RequiredState, + const SymbolLookupSet &Symbols, SymbolState RequiredState, SymbolsResolvedCallback NotifyComplete) : NotifyComplete(std::move(NotifyComplete)), RequiredState(RequiredState) { assert(RequiredState >= SymbolState::Resolved && @@ -298,8 +362,8 @@ AsynchronousSymbolQuery::AsynchronousSymbolQuery( OutstandingSymbolsCount = Symbols.size(); - for (auto &S : Symbols) - ResolvedSymbols[S] = nullptr; + for (auto &KV : Symbols) + ResolvedSymbols[KV.first] = nullptr; } void AsynchronousSymbolQuery::notifySymbolMetRequiredState( @@ -511,10 +575,10 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) { } ReExportsMaterializationUnit::ReExportsMaterializationUnit( - JITDylib *SourceJD, bool MatchNonExported, SymbolAliasMap Aliases, - VModuleKey K) + JITDylib *SourceJD, JITDylibLookupFlags SourceJDLookupFlags, + SymbolAliasMap Aliases, VModuleKey K) : MaterializationUnit(extractFlags(Aliases), std::move(K)), - SourceJD(SourceJD), MatchNonExported(MatchNonExported), + SourceJD(SourceJD), SourceJDLookupFlags(SourceJDLookupFlags), Aliases(std::move(Aliases)) {} StringRef ReExportsMaterializationUnit::getName() const { @@ -551,7 +615,7 @@ void ReExportsMaterializationUnit::materialize( if (!Aliases.empty()) { if (SourceJD) - R.replace(reexports(*SourceJD, std::move(Aliases), MatchNonExported)); + R.replace(reexports(*SourceJD, std::move(Aliases), SourceJDLookupFlags)); else R.replace(symbolAliases(std::move(Aliases))); } @@ -572,11 +636,11 @@ void ReExportsMaterializationUnit::materialize( // be waitin on a symbol that it itself had to resolve. Usually this will just // involve one round and a single query. - std::vector>> + std::vector>> QueryInfos; while (!RequestedAliases.empty()) { SymbolNameSet ResponsibilitySymbols; - SymbolNameSet QuerySymbols; + SymbolLookupSet QuerySymbols; SymbolAliasMap QueryAliases; // Collect as many aliases as we can without including a chain. 
@@ -587,7 +651,7 @@ void ReExportsMaterializationUnit::materialize( continue; ResponsibilitySymbols.insert(KV.first); - QuerySymbols.insert(KV.second.Aliasee); + QuerySymbols.add(KV.second.Aliasee); QueryAliases[KV.first] = std::move(KV.second); } @@ -657,8 +721,9 @@ void ReExportsMaterializationUnit::materialize( } }; - ES.lookup(JITDylibSearchList({{&SrcJD, MatchNonExported}}), QuerySymbols, - SymbolState::Resolved, std::move(OnComplete), + ES.lookup(LookupKind::Static, + JITDylibSearchOrder({{&SrcJD, SourceJDLookupFlags}}), + QuerySymbols, SymbolState::Resolved, std::move(OnComplete), std::move(RegisterDependencies)); } } @@ -681,16 +746,16 @@ ReExportsMaterializationUnit::extractFlags(const SymbolAliasMap &Aliases) { Expected buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) { - auto Flags = SourceJD.lookupFlags(Symbols); + SymbolLookupSet LookupSet(Symbols); + auto Flags = SourceJD.lookupFlags( + LookupKind::Static, JITDylibLookupFlags::MatchAllSymbols, LookupSet); if (!Flags) return Flags.takeError(); - if (Flags->size() != Symbols.size()) { - SymbolNameSet Unresolved = Symbols; - for (auto &KV : *Flags) - Unresolved.erase(KV.first); - return make_error(std::move(Unresolved)); + if (!LookupSet.empty()) { + LookupSet.sortByName(); + return make_error(LookupSet.getSymbolNames()); } SymbolAliasMap Result; @@ -703,32 +768,32 @@ buildSimpleReexportsAliasMap(JITDylib &SourceJD, const SymbolNameSet &Symbols) { } ReexportsGenerator::ReexportsGenerator(JITDylib &SourceJD, - bool MatchNonExported, + JITDylibLookupFlags SourceJDLookupFlags, SymbolPredicate Allow) - : SourceJD(SourceJD), MatchNonExported(MatchNonExported), + : SourceJD(SourceJD), SourceJDLookupFlags(SourceJDLookupFlags), Allow(std::move(Allow)) {} -Expected -ReexportsGenerator::tryToGenerate(JITDylib &JD, const SymbolNameSet &Names) { - orc::SymbolNameSet Added; - orc::SymbolAliasMap AliasMap; - - auto Flags = SourceJD.lookupFlags(Names); +Error ReexportsGenerator::tryToGenerate(LookupKind K, JITDylib &JD, + JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &LookupSet) { + assert(&JD != &SourceJD && "Cannot re-export from the same dylib"); + // Use lookupFlags to find the subset of symbols that match our lookup. + auto Flags = SourceJD.lookupFlags(K, JDLookupFlags, LookupSet); if (!Flags) return Flags.takeError(); - for (auto &KV : *Flags) { - if (Allow && !Allow(KV.first)) - continue; - AliasMap[KV.first] = SymbolAliasMapEntry(KV.first, KV.second); - Added.insert(KV.first); - } + // Create an alias map. + orc::SymbolAliasMap AliasMap; + for (auto &KV : *Flags) + if (!Allow || Allow(KV.first)) + AliasMap[KV.first] = SymbolAliasMapEntry(KV.first, KV.second); - if (!Added.empty()) - cantFail(JD.define(reexports(SourceJD, AliasMap, MatchNonExported))); + if (AliasMap.empty()) + return Error::success(); - return Added; + // Define the re-exports. 
+ return JD.define(reexports(SourceJD, AliasMap, SourceJDLookupFlags)); } JITDylib::DefinitionGenerator::~DefinitionGenerator() {} @@ -1252,41 +1317,41 @@ void JITDylib::notifyFailed(FailedSymbolsWorklist Worklist) { Q->handleFailed(make_error(FailedSymbolsMap)); } -void JITDylib::setSearchOrder(JITDylibSearchList NewSearchOrder, - bool SearchThisJITDylibFirst, - bool MatchNonExportedInThisDylib) { - if (SearchThisJITDylibFirst) { - if (NewSearchOrder.empty() || NewSearchOrder.front().first != this) - NewSearchOrder.insert(NewSearchOrder.begin(), - {this, MatchNonExportedInThisDylib}); - } - - ES.runSessionLocked([&]() { SearchOrder = std::move(NewSearchOrder); }); -} - -void JITDylib::addToSearchOrder(JITDylib &JD, bool MatchNonExported) { +void JITDylib::setSearchOrder(JITDylibSearchOrder NewSearchOrder, + bool SearchThisJITDylibFirst) { ES.runSessionLocked([&]() { - SearchOrder.push_back({&JD, MatchNonExported}); + if (SearchThisJITDylibFirst) { + SearchOrder.clear(); + if (NewSearchOrder.empty() || NewSearchOrder.front().first != this) + SearchOrder.push_back( + std::make_pair(this, JITDylibLookupFlags::MatchAllSymbols)); + SearchOrder.insert(SearchOrder.end(), NewSearchOrder.begin(), + NewSearchOrder.end()); + } else + SearchOrder = std::move(NewSearchOrder); }); } +void JITDylib::addToSearchOrder(JITDylib &JD, + JITDylibLookupFlags JDLookupFlags) { + ES.runSessionLocked([&]() { SearchOrder.push_back({&JD, JDLookupFlags}); }); +} + void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD, - bool MatchNonExported) { + JITDylibLookupFlags JDLookupFlags) { ES.runSessionLocked([&]() { - auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(), - [&](const JITDylibSearchList::value_type &KV) { - return KV.first == &OldJD; - }); - - if (I != SearchOrder.end()) - *I = {&NewJD, MatchNonExported}; + for (auto &KV : SearchOrder) + if (KV.first == &OldJD) { + KV = {&NewJD, JDLookupFlags}; + break; + } }); } void JITDylib::removeFromSearchOrder(JITDylib &JD) { ES.runSessionLocked([&]() { auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(), - [&](const JITDylibSearchList::value_type &KV) { + [&](const JITDylibSearchOrder::value_type &KV) { return KV.first == &JD; }); if (I != SearchOrder.end()) @@ -1349,63 +1414,54 @@ Error JITDylib::remove(const SymbolNameSet &Names) { }); } -Expected JITDylib::lookupFlags(const SymbolNameSet &Names) { +Expected +JITDylib::lookupFlags(LookupKind K, JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet LookupSet) { return ES.runSessionLocked([&, this]() -> Expected { SymbolFlagsMap Result; - auto Unresolved = lookupFlagsImpl(Result, Names); - if (!Unresolved) - return Unresolved.takeError(); + lookupFlagsImpl(Result, K, JDLookupFlags, LookupSet); - /// Run any definition generators. + // Run any definition generators. for (auto &DG : DefGenerators) { - // Bail out early if we've resolved everything. - if (Unresolved->empty()) + // Bail out early if we found everything. + if (LookupSet.empty()) break; // Run this generator. 
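// A sketch (std::function-based, not the DefinitionGenerator interface) of
// the generator protocol used in the loop above: each generator sees the
// still-unresolved set and may add definitions; the caller re-runs the
// search after each one and stops as soon as nothing is left.
#include <functional>
#include <set>
#include <string>
#include <vector>

using SymbolSet = std::set<std::string>;
using Generator = std::function<bool(SymbolSet &)>; // false reports an error

bool runGenerators(SymbolSet &Unresolved,
                   const std::vector<Generator> &Generators,
                   const std::function<void(SymbolSet &)> &Search) {
  Search(Unresolved); // first pass over the definitions we already have
  for (const Generator &G : Generators) {
    if (Unresolved.empty())
      break; // bail out early if we found everything
    if (!G(Unresolved))
      return false; // generator failed: surface the error
    Search(Unresolved); // re-try with any newly added definitions
  }
  return true;
}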
- auto NewDefs = DG->tryToGenerate(*this, *Unresolved); - if (!NewDefs) - return NewDefs.takeError(); - - if (!NewDefs->empty()) { - auto Unresolved2 = lookupFlagsImpl(Result, *NewDefs); - if (!Unresolved2) - return Unresolved2.takeError(); - (void)Unresolved2; - assert(Unresolved2->empty() && - "All fallback defs should have been found by lookupFlagsImpl"); - } + if (auto Err = DG->tryToGenerate(K, *this, JDLookupFlags, LookupSet)) + return std::move(Err); - for (auto &Name : *NewDefs) - Unresolved->erase(Name); + // Re-try the search. + lookupFlagsImpl(Result, K, JDLookupFlags, LookupSet); } + return Result; }); } -Expected JITDylib::lookupFlagsImpl(SymbolFlagsMap &Flags, - const SymbolNameSet &Names) { - SymbolNameSet Unresolved; - - for (auto &Name : Names) { - auto I = Symbols.find(Name); - if (I != Symbols.end()) { - assert(!Flags.count(Name) && "Symbol already present in Flags map"); - Flags[Name] = I->second.getFlags(); - } else - Unresolved.insert(Name); - } +void JITDylib::lookupFlagsImpl(SymbolFlagsMap &Result, LookupKind K, + JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &LookupSet) { - return Unresolved; + LookupSet.forEachWithRemoval( + [&](const SymbolStringPtr &Name, SymbolLookupFlags Flags) -> bool { + auto I = Symbols.find(Name); + if (I == Symbols.end()) + return false; + assert(!Result.count(Name) && "Symbol already present in Flags map"); + Result[Name] = I->second.getFlags(); + return true; + }); } -Error JITDylib::lodgeQuery(std::shared_ptr &Q, - SymbolNameSet &Unresolved, bool MatchNonExported, - MaterializationUnitList &MUs) { +Error JITDylib::lodgeQuery(MaterializationUnitList &MUs, + std::shared_ptr &Q, + LookupKind K, JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &Unresolved) { assert(Q && "Query can not be null"); - if (auto Err = lodgeQueryImpl(Q, Unresolved, MatchNonExported, MUs)) + if (auto Err = lodgeQueryImpl(MUs, Q, K, JDLookupFlags, Unresolved)) return Err; // Run any definition generators. @@ -1416,104 +1472,86 @@ Error JITDylib::lodgeQuery(std::shared_ptr &Q, break; // Run the generator. - auto NewDefs = DG->tryToGenerate(*this, Unresolved); - - // If the generator returns an error then bail out. - if (!NewDefs) - return NewDefs.takeError(); - - // If the generator was able to generate new definitions for any of the - // unresolved symbols then lodge the query against them. - if (!NewDefs->empty()) { - for (auto &D : *NewDefs) - Unresolved.erase(D); - - // Lodge query. This can not fail as any new definitions were added - // by the generator under the session locked. Since they can't have - // started materializing yet the can not have failed. - cantFail(lodgeQueryImpl(Q, *NewDefs, MatchNonExported, MUs)); + if (auto Err = DG->tryToGenerate(K, *this, JDLookupFlags, Unresolved)) + return Err; - assert(NewDefs->empty() && - "All fallback defs should have been found by lookupImpl"); - } + // Lodge query. This can not fail as any new definitions were added + // by the generator under the session locked. Since they can't have + // started materializing yet they can not have failed. + cantFail(lodgeQueryImpl(MUs, Q, K, JDLookupFlags, Unresolved)); } return Error::success(); } -Error JITDylib::lodgeQueryImpl( - std::shared_ptr &Q, SymbolNameSet &Unresolved, - bool MatchNonExported, - std::vector> &MUs) { - - std::vector ToRemove; - for (auto Name : Unresolved) { - - // Search for the name in Symbols. Skip it if not found. 
- auto SymI = Symbols.find(Name); - if (SymI == Symbols.end()) - continue; - - // If this is a non exported symbol and we're skipping those then skip it. - if (!SymI->second.getFlags().isExported() && !MatchNonExported) - continue; - - // If we matched against Name in JD, mark it to be removed from the - // Unresolved set. - ToRemove.push_back(Name); - - // If we matched against this symbol but it is in the error state then - // bail out and treat it as a failure to materialize. - if (SymI->second.getFlags().hasError()) { - auto FailedSymbolsMap = std::make_shared(); - (*FailedSymbolsMap)[this] = {Name}; - return make_error(std::move(FailedSymbolsMap)); - } - - // If this symbol already meets the required state for then notify the - // query and continue. - if (SymI->second.getState() >= Q->getRequiredState()) { - Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol()); - continue; - } - - // Otherwise this symbol does not yet meet the required state. Check whether - // it has a materializer attached, and if so prepare to run it. - if (SymI->second.hasMaterializerAttached()) { - assert(SymI->second.getAddress() == 0 && - "Symbol not resolved but already has address?"); - auto UMII = UnmaterializedInfos.find(Name); - assert(UMII != UnmaterializedInfos.end() && - "Lazy symbol should have UnmaterializedInfo"); - auto MU = std::move(UMII->second->MU); - assert(MU != nullptr && "Materializer should not be null"); - - // Move all symbols associated with this MaterializationUnit into - // materializing state. - for (auto &KV : MU->getSymbols()) { - auto SymK = Symbols.find(KV.first); - SymK->second.setMaterializerAttached(false); - SymK->second.setState(SymbolState::Materializing); - UnmaterializedInfos.erase(KV.first); - } +Error JITDylib::lodgeQueryImpl(MaterializationUnitList &MUs, + std::shared_ptr &Q, + LookupKind K, JITDylibLookupFlags JDLookupFlags, + SymbolLookupSet &Unresolved) { + + return Unresolved.forEachWithRemoval( + [&](const SymbolStringPtr &Name, + SymbolLookupFlags SymLookupFlags) -> Expected { + // Search for name in symbols. If not found then continue without + // removal. + auto SymI = Symbols.find(Name); + if (SymI == Symbols.end()) + return false; + + // If this is a non exported symbol and we're matching exported symbols + // only then skip this symbol without removal. + if (!SymI->second.getFlags().isExported() && + JDLookupFlags == JITDylibLookupFlags::MatchExportedSymbolsOnly) + return false; + + // If we matched against this symbol but it is in the error state then + // bail out and treat it as a failure to materialize. + if (SymI->second.getFlags().hasError()) { + auto FailedSymbolsMap = std::make_shared(); + (*FailedSymbolsMap)[this] = {Name}; + return make_error(std::move(FailedSymbolsMap)); + } - // Add MU to the list of MaterializationUnits to be materialized. - MUs.push_back(std::move(MU)); - } + // If this symbol already meets the required state for then notify the + // query, then remove the symbol and continue. + if (SymI->second.getState() >= Q->getRequiredState()) { + Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol()); + return true; + } - // Add the query to the PendingQueries list. - assert(SymI->second.isInMaterializationPhase() && - "By this line the symbol should be materializing"); - auto &MI = MaterializingInfos[Name]; - MI.addQuery(Q); - Q->addQueryDependence(*this, Name); - } + // Otherwise this symbol does not yet meet the required state. Check + // whether it has a materializer attached, and if so prepare to run it. 
+ if (SymI->second.hasMaterializerAttached()) { + assert(SymI->second.getAddress() == 0 && + "Symbol not resolved but already has address?"); + auto UMII = UnmaterializedInfos.find(Name); + assert(UMII != UnmaterializedInfos.end() && + "Lazy symbol should have UnmaterializedInfo"); + auto MU = std::move(UMII->second->MU); + assert(MU != nullptr && "Materializer should not be null"); + + // Move all symbols associated with this MaterializationUnit into + // materializing state. + for (auto &KV : MU->getSymbols()) { + auto SymK = Symbols.find(KV.first); + SymK->second.setMaterializerAttached(false); + SymK->second.setState(SymbolState::Materializing); + UnmaterializedInfos.erase(KV.first); + } - // Remove any symbols that we found. - for (auto &Name : ToRemove) - Unresolved.erase(Name); + // Add MU to the list of MaterializationUnits to be materialized. + MUs.push_back(std::move(MU)); + } - return Error::success(); + // Add the query to the PendingQueries list and continue, deleting the + // element. + assert(SymI->second.isInMaterializationPhase() && + "By this line the symbol should be materializing"); + auto &MI = MaterializingInfos[Name]; + MI.addQuery(Q); + Q->addQueryDependence(*this, Name); + return true; + }); } Expected @@ -1526,7 +1564,7 @@ JITDylib::legacyLookup(std::shared_ptr Q, bool QueryComplete = false; std::vector> MUs; - SymbolNameSet Unresolved = std::move(Names); + SymbolLookupSet Unresolved(Names); auto Err = ES.runSessionLocked([&, this]() -> Error { QueryComplete = lookupImpl(Q, MUs, Unresolved); @@ -1538,16 +1576,13 @@ JITDylib::legacyLookup(std::shared_ptr Q, break; assert(!QueryComplete && "query complete but unresolved symbols remain?"); - auto NewDefs = DG->tryToGenerate(*this, Unresolved); - if (!NewDefs) - return NewDefs.takeError(); - if (!NewDefs->empty()) { - for (auto &D : *NewDefs) - Unresolved.erase(D); - QueryComplete = lookupImpl(Q, MUs, *NewDefs); - assert(NewDefs->empty() && - "All fallback defs should have been found by lookupImpl"); - } + if (auto Err = DG->tryToGenerate(LookupKind::Static, *this, + JITDylibLookupFlags::MatchAllSymbols, + Unresolved)) + return Err; + + if (!Unresolved.empty()) + QueryComplete = lookupImpl(Q, MUs, Unresolved); } return Error::success(); }); @@ -1575,68 +1610,68 @@ JITDylib::legacyLookup(std::shared_ptr Q, // for (auto &MU : MUs) // ES.dispatchMaterialization(*this, std::move(MU)); - return Unresolved; + SymbolNameSet RemainingSymbols; + for (auto &KV : Unresolved) + RemainingSymbols.insert(KV.first); + + return RemainingSymbols; } bool JITDylib::lookupImpl( std::shared_ptr &Q, std::vector> &MUs, - SymbolNameSet &Unresolved) { + SymbolLookupSet &Unresolved) { bool QueryComplete = false; std::vector ToRemove; - for (auto Name : Unresolved) { - - // Search for the name in Symbols. Skip it if not found. - auto SymI = Symbols.find(Name); - if (SymI == Symbols.end()) - continue; - - // If we found Name, mark it to be removed from the Unresolved set. - ToRemove.push_back(Name); - - if (SymI->second.getState() >= Q->getRequiredState()) { - Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol()); - if (Q->isComplete()) - QueryComplete = true; - continue; - } - - // If the symbol is lazy, get the MaterialiaztionUnit for it. 
- if (SymI->second.hasMaterializerAttached()) { - assert(SymI->second.getAddress() == 0 && - "Lazy symbol should not have a resolved address"); - auto UMII = UnmaterializedInfos.find(Name); - assert(UMII != UnmaterializedInfos.end() && - "Lazy symbol should have UnmaterializedInfo"); - auto MU = std::move(UMII->second->MU); - assert(MU != nullptr && "Materializer should not be null"); - - // Kick all symbols associated with this MaterializationUnit into - // materializing state. - for (auto &KV : MU->getSymbols()) { - auto SymK = Symbols.find(KV.first); - assert(SymK != Symbols.end() && "Missing symbol table entry"); - SymK->second.setState(SymbolState::Materializing); - SymK->second.setMaterializerAttached(false); - UnmaterializedInfos.erase(KV.first); - } + Unresolved.forEachWithRemoval( + [&](const SymbolStringPtr &Name, SymbolLookupFlags Flags) -> bool { + // Search for the name in Symbols. Skip without removing if not found. + auto SymI = Symbols.find(Name); + if (SymI == Symbols.end()) + return false; + + // If the symbol is already in the required state then notify the query + // and remove. + if (SymI->second.getState() >= Q->getRequiredState()) { + Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol()); + if (Q->isComplete()) + QueryComplete = true; + return true; + } - // Add MU to the list of MaterializationUnits to be materialized. - MUs.push_back(std::move(MU)); - } + // If the symbol is lazy, get the MaterialiaztionUnit for it. + if (SymI->second.hasMaterializerAttached()) { + assert(SymI->second.getAddress() == 0 && + "Lazy symbol should not have a resolved address"); + auto UMII = UnmaterializedInfos.find(Name); + assert(UMII != UnmaterializedInfos.end() && + "Lazy symbol should have UnmaterializedInfo"); + auto MU = std::move(UMII->second->MU); + assert(MU != nullptr && "Materializer should not be null"); + + // Kick all symbols associated with this MaterializationUnit into + // materializing state. + for (auto &KV : MU->getSymbols()) { + auto SymK = Symbols.find(KV.first); + assert(SymK != Symbols.end() && "Missing symbol table entry"); + SymK->second.setState(SymbolState::Materializing); + SymK->second.setMaterializerAttached(false); + UnmaterializedInfos.erase(KV.first); + } - // Add the query to the PendingQueries list. - assert(SymI->second.isInMaterializationPhase() && - "By this line the symbol should be materializing"); - auto &MI = MaterializingInfos[Name]; - MI.addQuery(Q); - Q->addQueryDependence(*this, Name); - } + // Add MU to the list of MaterializationUnits to be materialized. + MUs.push_back(std::move(MU)); + } - // Remove any marked symbols from the Unresolved set. - for (auto &Name : ToRemove) - Unresolved.erase(Name); + // Add the query to the PendingQueries list. + assert(SymI->second.isInMaterializationPhase() && + "By this line the symbol should be materializing"); + auto &MI = MaterializingInfos[Name]; + MI.addQuery(Q); + Q->addQueryDependence(*this, Name); + return true; + }); return QueryComplete; } @@ -1645,11 +1680,7 @@ void JITDylib::dump(raw_ostream &OS) { ES.runSessionLocked([&, this]() { OS << "JITDylib \"" << JITDylibName << "\" (ES: " << format("0x%016" PRIx64, reinterpret_cast(&ES)) << "):\n" - << "Search order: ["; - for (auto &KV : SearchOrder) - OS << " (\"" << KV.first->getName() << "\", " - << (KV.second ? 
"all" : "exported only") << ")"; - OS << " ]\n" + << "Search order: " << SearchOrder << "\n" << "Symbol table:\n"; for (auto &KV : Symbols) { @@ -1730,7 +1761,7 @@ JITDylib::MaterializingInfo::takeQueriesMeeting(SymbolState RequiredState) { JITDylib::JITDylib(ExecutionSession &ES, std::string Name) : ES(ES), JITDylibName(std::move(Name)) { - SearchOrder.push_back({this, true}); + SearchOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols}); } Error JITDylib::defineImpl(MaterializationUnit &MU) { @@ -1823,12 +1854,6 @@ void JITDylib::transferEmittedNodeDependencies( ExecutionSession::ExecutionSession(std::shared_ptr SSP) : SSP(SSP ? std::move(SSP) : std::make_shared()) { - // Construct the main dylib. - JDs.push_back(std::unique_ptr(new JITDylib(*this, "
"))); -} - -JITDylib &ExecutionSession::getMainJITDylib() { - return runSessionLocked([this]() -> JITDylib & { return *JDs.front(); }); } JITDylib *ExecutionSession::getJITDylibByName(StringRef Name) { @@ -1840,14 +1865,11 @@ JITDylib *ExecutionSession::getJITDylibByName(StringRef Name) { }); } -JITDylib &ExecutionSession::createJITDylib(std::string Name, - bool AddToMainDylibSearchOrder) { +JITDylib &ExecutionSession::createJITDylib(std::string Name) { assert(!getJITDylibByName(Name) && "JITDylib with that name already exists"); return runSessionLocked([&, this]() -> JITDylib & { JDs.push_back( std::unique_ptr(new JITDylib(*this, std::move(Name)))); - if (AddToMainDylibSearchOrder) - JDs.front()->addToSearchOrder(*JDs.back()); return *JDs.back(); }); } @@ -1898,7 +1920,7 @@ Expected ExecutionSession::legacyLookup( #endif auto Query = std::make_shared( - Names, RequiredState, std::move(NotifyComplete)); + SymbolLookupSet(Names), RequiredState, std::move(NotifyComplete)); // FIXME: This should be run session locked along with the registration code // and error reporting below. SymbolNameSet UnresolvedSymbols = AsyncLookup(Query, std::move(Names)); @@ -1935,8 +1957,9 @@ Expected ExecutionSession::legacyLookup( } void ExecutionSession::lookup( - const JITDylibSearchList &SearchOrder, SymbolNameSet Symbols, - SymbolState RequiredState, SymbolsResolvedCallback NotifyComplete, + LookupKind K, const JITDylibSearchOrder &SearchOrder, + SymbolLookupSet Symbols, SymbolState RequiredState, + SymbolsResolvedCallback NotifyComplete, RegisterDependenciesFunction RegisterDependencies) { LLVM_DEBUG({ @@ -1965,14 +1988,24 @@ void ExecutionSession::lookup( "JITDylibList should not contain duplicate entries"); auto &JD = *KV.first; - auto MatchNonExported = KV.second; - if (auto Err = JD.lodgeQuery(Q, Unresolved, MatchNonExported, - CollectedMUsMap[&JD])) + auto JDLookupFlags = KV.second; + if (auto Err = JD.lodgeQuery(CollectedMUsMap[&JD], Q, K, JDLookupFlags, + Unresolved)) return Err; } + // Strip any weakly referenced symbols that were not found. + Unresolved.forEachWithRemoval( + [&](const SymbolStringPtr &Name, SymbolLookupFlags Flags) { + if (Flags == SymbolLookupFlags::WeaklyReferencedSymbol) { + Q->dropSymbol(Name); + return true; + } + return false; + }); + if (!Unresolved.empty()) - return make_error(std::move(Unresolved)); + return make_error(Unresolved.getSymbolNames()); return Error::success(); }; @@ -2026,8 +2059,8 @@ void ExecutionSession::lookup( } Expected -ExecutionSession::lookup(const JITDylibSearchList &SearchOrder, - const SymbolNameSet &Symbols, +ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder, + const SymbolLookupSet &Symbols, LookupKind K, SymbolState RequiredState, RegisterDependenciesFunction RegisterDependencies) { #if LLVM_ENABLE_THREADS @@ -2059,7 +2092,7 @@ ExecutionSession::lookup(const JITDylibSearchList &SearchOrder, #endif // Perform the asynchronous lookup. 
- lookup(SearchOrder, Symbols, RequiredState, NotifyComplete, + lookup(K, SearchOrder, Symbols, RequiredState, NotifyComplete, RegisterDependencies); #if LLVM_ENABLE_THREADS @@ -2080,12 +2113,12 @@ ExecutionSession::lookup(const JITDylibSearchList &SearchOrder, } Expected -ExecutionSession::lookup(const JITDylibSearchList &SearchOrder, +ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder, SymbolStringPtr Name) { - SymbolNameSet Names({Name}); + SymbolLookupSet Names({Name}); - if (auto ResultMap = lookup(SearchOrder, std::move(Names), SymbolState::Ready, - NoDependenciesToRegister)) { + if (auto ResultMap = lookup(SearchOrder, std::move(Names), LookupKind::Static, + SymbolState::Ready, NoDependenciesToRegister)) { assert(ResultMap->size() == 1 && "Unexpected number of results"); assert(ResultMap->count(Name) && "Missing result for symbol"); return std::move(ResultMap->begin()->second); @@ -2096,14 +2129,7 @@ ExecutionSession::lookup(const JITDylibSearchList &SearchOrder, Expected ExecutionSession::lookup(ArrayRef SearchOrder, SymbolStringPtr Name) { - SymbolNameSet Names({Name}); - - JITDylibSearchList FullSearchOrder; - FullSearchOrder.reserve(SearchOrder.size()); - for (auto *JD : SearchOrder) - FullSearchOrder.push_back({JD, false}); - - return lookup(FullSearchOrder, Name); + return lookup(makeJITDylibSearchOrder(SearchOrder), Name); } Expected diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 4a886ac0597c1..0a3fef207ac2f 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -19,6 +19,32 @@ namespace llvm { namespace orc { +int runAsMain(int (*Main)(int, char *[]), ArrayRef Args, + Optional ProgramName) { + std::vector> ArgVStorage; + std::vector ArgV; + + ArgVStorage.reserve(Args.size() + (ProgramName ? 1 : 0)); + ArgV.reserve(Args.size() + 1 + (ProgramName ? 1 : 0)); + + if (ProgramName) { + ArgVStorage.push_back(std::make_unique(ProgramName->size() + 1)); + llvm::copy(*ProgramName, &ArgVStorage.back()[0]); + ArgVStorage.back()[ProgramName->size()] = '\0'; + ArgV.push_back(ArgVStorage.back().get()); + } + + for (auto &Arg : Args) { + ArgVStorage.push_back(std::make_unique(Arg.size() + 1)); + llvm::copy(Arg, &ArgVStorage.back()[0]); + ArgVStorage.back()[Arg.size()] = '\0'; + ArgV.push_back(ArgVStorage.back().get()); + } + ArgV.push_back(nullptr); + + return Main(Args.size(), ArgV.data()); +} + CtorDtorIterator::CtorDtorIterator(const GlobalVariable *GV, bool End) : InitList( GV ? 
dyn_cast_or_null(GV->getInitializer()) : nullptr), @@ -118,19 +144,17 @@ void CtorDtorRunner::add(iterator_range CtorDtors) { Error CtorDtorRunner::run() { using CtorDtorTy = void (*)(); - SymbolNameSet Names; - - for (auto &KV : CtorDtorsByPriority) { - for (auto &Name : KV.second) { - auto Added = Names.insert(Name).second; - (void)Added; - assert(Added && "Ctor/Dtor names clashed"); - } - } + SymbolLookupSet LookupSet; + for (auto &KV : CtorDtorsByPriority) + for (auto &Name : KV.second) + LookupSet.add(Name); + assert(!LookupSet.containsDuplicates() && + "Ctor/Dtor list contains duplicates"); auto &ES = JD.getExecutionSession(); - if (auto CtorDtorMap = - ES.lookup(JITDylibSearchList({{&JD, true}}), std::move(Names))) { + if (auto CtorDtorMap = ES.lookup( + makeJITDylibSearchOrder(&JD, JITDylibLookupFlags::MatchAllSymbols), + std::move(LookupSet))) { for (auto &KV : CtorDtorsByPriority) { for (auto &Name : KV.second) { assert(CtorDtorMap->count(Name) && "No entry for Name"); @@ -190,15 +214,16 @@ DynamicLibrarySearchGenerator::Load(const char *FileName, char GlobalPrefix, std::move(Lib), GlobalPrefix, std::move(Allow)); } -Expected -DynamicLibrarySearchGenerator::tryToGenerate(JITDylib &JD, - const SymbolNameSet &Names) { - orc::SymbolNameSet Added; +Error DynamicLibrarySearchGenerator::tryToGenerate( + LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) { orc::SymbolMap NewSymbols; bool HasGlobalPrefix = (GlobalPrefix != '\0'); - for (auto &Name : Names) { + for (auto &KV : Symbols) { + auto &Name = KV.first; + if ((*Name).empty()) continue; @@ -211,20 +236,16 @@ DynamicLibrarySearchGenerator::tryToGenerate(JITDylib &JD, std::string Tmp((*Name).data() + HasGlobalPrefix, (*Name).size() - HasGlobalPrefix); if (void *Addr = Dylib.getAddressOfSymbol(Tmp.c_str())) { - Added.insert(Name); NewSymbols[Name] = JITEvaluatedSymbol( static_cast(reinterpret_cast(Addr)), JITSymbolFlags::Exported); } } - // Add any new symbols to JD. Since the generator is only called for symbols - // that are not already defined, this will never trigger a duplicate - // definition error, so we can wrap this call in a 'cantFail'. - if (!NewSymbols.empty()) - cantFail(JD.define(absoluteSymbols(std::move(NewSymbols)))); + if (NewSymbols.empty()) + return Error::success(); - return Added; + return JD.define(absoluteSymbols(std::move(NewSymbols))); } Expected> @@ -251,15 +272,24 @@ StaticLibraryDefinitionGenerator::Create( return std::move(ADG); } -Expected -StaticLibraryDefinitionGenerator::tryToGenerate(JITDylib &JD, - const SymbolNameSet &Names) { +Error StaticLibraryDefinitionGenerator::tryToGenerate( + LookupKind K, JITDylib &JD, JITDylibLookupFlags JDLookupFlags, + const SymbolLookupSet &Symbols) { + + // Don't materialize symbols from static archives unless this is a static + // lookup. + if (K != LookupKind::Static) + return Error::success(); + + // Bail out early if we've already freed the archive. 
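// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// Definition generators now return Error and install any definitions they
// can supply directly on the JITDylib, rather than returning the set of
// newly defined names. A skeleton generator under the new interface might
// look like the following (the class name is hypothetical):
class NullGenerator : public JITDylib::DefinitionGenerator {
public:
  Error tryToGenerate(LookupKind K, JITDylib &JD,
                      JITDylibLookupFlags JDLookupFlags,
                      const SymbolLookupSet &Symbols) override {
    // A real generator would inspect Symbols and call JD.define(...) for
    // whatever it can provide; leaving Symbols untouched simply means
    // "nothing found here", as DynamicLibrarySearchGenerator does above when
    // the dylib lacks the requested names.
    return Error::success();
  }
};
// ----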
+ if (!Archive) + return Error::success(); DenseSet> ChildBufferInfos; - SymbolNameSet NewDefs; - for (const auto &Name : Names) { - auto Child = Archive.findSym(*Name); + for (const auto &KV : Symbols) { + const auto &Name = KV.first; + auto Child = Archive->findSym(*Name); if (!Child) return Child.takeError(); if (*Child == None) @@ -269,7 +299,6 @@ StaticLibraryDefinitionGenerator::tryToGenerate(JITDylib &JD, return ChildBuffer.takeError(); ChildBufferInfos.insert( {ChildBuffer->getBuffer(), ChildBuffer->getBufferIdentifier()}); - NewDefs.insert(Name); } for (auto ChildBufferInfo : ChildBufferInfos) { @@ -278,31 +307,16 @@ StaticLibraryDefinitionGenerator::tryToGenerate(JITDylib &JD, if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef), VModuleKey())) - return std::move(Err); - - --UnrealizedObjects; + return Err; } - return NewDefs; + return Error::success(); } StaticLibraryDefinitionGenerator::StaticLibraryDefinitionGenerator( ObjectLayer &L, std::unique_ptr ArchiveBuffer, Error &Err) : L(L), ArchiveBuffer(std::move(ArchiveBuffer)), - Archive(*this->ArchiveBuffer, Err) { - - if (Err) - return; - - Error Err2 = Error::success(); - for (auto _ : Archive.children(Err2)) { - (void)_; - ++UnrealizedObjects; - } - - // No need to check this: We will leave it to the caller. - Err = std::move(Err2); -} + Archive(std::make_unique(*this->ArchiveBuffer, Err)) {} } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp index 0295db7633dd0..440935ffe9fb9 100644 --- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp @@ -101,7 +101,10 @@ JITTargetAddress JITCompileCallbackManager::executeCompileCallback( Name = I->second; } - if (auto Sym = ES.lookup(JITDylibSearchList({{&CallbacksJD, true}}), Name)) + if (auto Sym = + ES.lookup(makeJITDylibSearchOrder( + &CallbacksJD, JITDylibLookupFlags::MatchAllSymbols), + Name)) return Sym->getAddress(); else { llvm::dbgs() << "Didn't find callback.\n"; diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp index 1d3e6db913e21..114e81e41771b 100644 --- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp +++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp @@ -28,14 +28,12 @@ Expected JITTargetMachineBuilder::detectHost() { // Retrieve host CPU name and sub-target features and add them to builder. // Relocation model, code model and codegen opt level are kept to default // values. 
-  llvm::SubtargetFeatures SubtargetFeatures;
   llvm::StringMap<bool> FeatureMap;
   llvm::sys::getHostCPUFeatures(FeatureMap);
   for (auto &Feature : FeatureMap)
-    SubtargetFeatures.AddFeature(Feature.first(), Feature.second);
+    TMBuilder.getFeatures().AddFeature(Feature.first(), Feature.second);

   TMBuilder.setCPU(llvm::sys::getHostCPUName());
-  TMBuilder.addFeatures(SubtargetFeatures.getFeatures());

   return TMBuilder;
 }
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 03f22e0c2a2a9..89dad6d61b42d 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -56,7 +56,9 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {

 Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
                                                         StringRef Name) {
-  return ES->lookup(JITDylibSearchList({{&JD, true}}), ES->intern(Name));
+  return ES->lookup(
+      makeJITDylibSearchOrder(&JD, JITDylibLookupFlags::MatchAllSymbols),
+      ES->intern(Name));
 }

 std::unique_ptr<ObjectLayer>
@@ -103,7 +105,7 @@ LLJIT::createCompileFunction(LLJITBuilderState &S,

 LLJIT::LLJIT(LLJITBuilderState &S, Error &Err)
     : ES(S.ES ? std::move(S.ES) : std::make_unique<ExecutionSession>()),
-      Main(this->ES->getMainJITDylib()), DL(""),
+      Main(this->ES->createJITDylib("<main>
")), DL(""), ObjLinkingLayer(createObjectLinkingLayer(S, *ES)), ObjTransformLayer(*this->ES, *ObjLinkingLayer), CtorRunner(Main), DtorRunner(Main) { diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp index 93aabd817d601..aab490feb8ea2 100644 --- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp @@ -50,8 +50,10 @@ LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) { SourceJD = I->second.first; SymbolName = I->second.second; } - auto LookupResult = - ES.lookup(JITDylibSearchList({{SourceJD, true}}), SymbolName); + + auto LookupResult = ES.lookup( + makeJITDylibSearchOrder(SourceJD, JITDylibLookupFlags::MatchAllSymbols), + SymbolName); if (!LookupResult) { ES.reportError(LookupResult.takeError()); diff --git a/llvm/lib/ExecutionEngine/Orc/Legacy.cpp b/llvm/lib/ExecutionEngine/Orc/Legacy.cpp index 9f9a6730b2c30..67b804c37287d 100644 --- a/llvm/lib/ExecutionEngine/Orc/Legacy.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Legacy.cpp @@ -37,7 +37,8 @@ void JITSymbolResolverAdapter::lookup(const LookupSet &Symbols, }; auto Q = std::make_shared( - InternedSymbols, SymbolState::Resolved, std::move(OnResolvedWithUnwrap)); + SymbolLookupSet(InternedSymbols), SymbolState::Resolved, + std::move(OnResolvedWithUnwrap)); auto Unresolved = R.lookup(Q, InternedSymbols); if (Unresolved.empty()) { diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp index 874decb2ade0b..be0ce4a1d75a0 100644 --- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp @@ -47,18 +47,28 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { MR.failMaterialization(); } - void lookup(const DenseSet &Symbols, + void lookup(const LookupMap &Symbols, std::unique_ptr LC) override { - JITDylibSearchList SearchOrder; + JITDylibSearchOrder SearchOrder; MR.getTargetJITDylib().withSearchOrderDo( - [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; }); + [&](const JITDylibSearchOrder &O) { SearchOrder = O; }); auto &ES = Layer.getExecutionSession(); - SymbolNameSet InternedSymbols; - for (auto &S : Symbols) - InternedSymbols.insert(ES.intern(S)); + SymbolLookupSet LookupSet; + for (auto &KV : Symbols) { + orc::SymbolLookupFlags LookupFlags; + switch (KV.second) { + case jitlink::SymbolLookupFlags::RequiredSymbol: + LookupFlags = orc::SymbolLookupFlags::RequiredSymbol; + break; + case jitlink::SymbolLookupFlags::WeaklyReferencedSymbol: + LookupFlags = orc::SymbolLookupFlags::WeaklyReferencedSymbol; + break; + } + LookupSet.add(ES.intern(KV.first), LookupFlags); + } // OnResolve -- De-intern the symbols and pass the result to the linker. 
auto OnResolve = [this, LookupContinuation = std::move(LC)]( @@ -74,8 +84,9 @@ class ObjectLinkingLayerJITLinkContext final : public JITLinkContext { } }; - ES.lookup(SearchOrder, std::move(InternedSymbols), SymbolState::Resolved, - std::move(OnResolve), [this](const SymbolDependenceMap &Deps) { + ES.lookup(LookupKind::Static, SearchOrder, std::move(LookupSet), + SymbolState::Resolved, std::move(OnResolve), + [this](const SymbolDependenceMap &Deps) { registerDependencies(Deps); }); } diff --git a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp index 939cd539d1fb0..3344bd4d53f98 100644 --- a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp @@ -19,11 +19,11 @@ class JITDylibSearchOrderResolver : public JITSymbolResolver { void lookup(const LookupSet &Symbols, OnResolvedFunction OnResolved) { auto &ES = MR.getTargetJITDylib().getExecutionSession(); - SymbolNameSet InternedSymbols; + SymbolLookupSet InternedSymbols; // Intern the requested symbols: lookup takes interned strings. for (auto &S : Symbols) - InternedSymbols.insert(ES.intern(S)); + InternedSymbols.add(ES.intern(S)); // Build an OnResolve callback to unwrap the interned strings and pass them // to the OnResolved callback. @@ -46,11 +46,12 @@ class JITDylibSearchOrderResolver : public JITSymbolResolver { MR.addDependenciesForAll(Deps); }; - JITDylibSearchList SearchOrder; + JITDylibSearchOrder SearchOrder; MR.getTargetJITDylib().withSearchOrderDo( - [&](const JITDylibSearchList &JDs) { SearchOrder = JDs; }); - ES.lookup(SearchOrder, InternedSymbols, SymbolState::Resolved, - std::move(OnResolvedWithUnwrap), RegisterDependencies); + [&](const JITDylibSearchOrder &JDs) { SearchOrder = JDs; }); + ES.lookup(LookupKind::Static, SearchOrder, InternedSymbols, + SymbolState::Resolved, std::move(OnResolvedWithUnwrap), + RegisterDependencies); } Expected getResponsibilitySet(const LookupSet &Symbols) { diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 01989b97f7fa0..f9d4b181f862b 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3400,9 +3400,6 @@ void AssemblyWriter::printTypeIdentities() { /// printFunction - Print all aspects of a function. void AssemblyWriter::printFunction(const Function *F) { - // Print out the return type and name. 
- Out << '\n'; - if (AnnotationWriter) AnnotationWriter->emitFunctionAnnot(F, Out); if (F->isMaterializable()) diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d2dd2a69beab2..5aaf90df6f6e3 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -559,6 +559,26 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); return true; } + if (Name.startswith("arm.neon.vqadds.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sadd_sat, + F->arg_begin()->getType()); + return true; + } + if (Name.startswith("arm.neon.vqaddu.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::uadd_sat, + F->arg_begin()->getType()); + return true; + } + if (Name.startswith("arm.neon.vqsubs.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ssub_sat, + F->arg_begin()->getType()); + return true; + } + if (Name.startswith("arm.neon.vqsubu.")) { + NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::usub_sat, + F->arg_begin()->getType()); + return true; + } if (Name.startswith("aarch64.neon.addp")) { if (F->arg_size() != 2) break; // Invalid IR. diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index 68c3c7ad90dab..2a8ea0657dbb6 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -64,11 +64,11 @@ ConstantRange ConstantRange::fromKnownBits(const KnownBits &Known, // For unsigned ranges, or signed ranges with known sign bit, create a simple // range between the smallest and largest possible value. if (!IsSigned || Known.isNegative() || Known.isNonNegative()) - return ConstantRange(Known.One, ~Known.Zero + 1); + return ConstantRange(Known.getMinValue(), Known.getMaxValue() + 1); // If we don't know the sign bit, pick the lower bound as a negative number // and the upper bound as a non-negative one. 
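// ---- Editor's aside (worked example, not part of the patch) ----
// A concrete instance of the rewritten fromKnownBits logic, using the
// KnownBits helpers it now relies on (getMinValue() == One, getMaxValue()
// == ~Zero). For 4-bit values with bit 0 known one and bit 2 known zero:
KnownBits Known(4);
Known.One = APInt(4, 0x1);  // bit 0 is known one
Known.Zero = APInt(4, 0x4); // bit 2 is known zero
ConstantRange CR = ConstantRange::fromKnownBits(Known, /*IsSigned=*/false);
// getMinValue() == 1 and getMaxValue() == 11, so CR is [1, 12): every value
// consistent with the known bits lies inside the range.
// ----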
- APInt Lower = Known.One, Upper = ~Known.Zero; + APInt Lower = Known.getMinValue(), Upper = Known.getMaxValue(); Lower.setSignBit(); Upper.clearSignBit(); return ConstantRange(Lower, Upper + 1); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 59b4a5ef8a186..bdd9f6baf3791 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -306,10 +306,11 @@ DIDerivedType *DIBuilder::createReferenceType( DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name, DIFile *File, unsigned LineNo, - DIScope *Context) { + DIScope *Context, + uint32_t AlignInBits) { return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File, - LineNo, getNonCompileUnitScope(Context), Ty, 0, 0, - 0, None, DINode::FlagZero); + LineNo, getNonCompileUnitScope(Context), Ty, 0, + AlignInBits, 0, None, DINode::FlagZero); } DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 1bbe6b85d2600..62bfeb5c5d77a 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1108,11 +1108,10 @@ LLVMMetadataRef LLVMDIBuilderCreateTypedef(LLVMDIBuilderRef Builder, LLVMMetadataRef Type, const char *Name, size_t NameLen, LLVMMetadataRef File, unsigned LineNo, - LLVMMetadataRef Scope) { + LLVMMetadataRef Scope, uint32_t AlignInBits) { return wrap(unwrap(Builder)->createTypedef( - unwrapDI(Type), {Name, NameLen}, - unwrapDI(File), LineNo, - unwrapDI(Scope))); + unwrapDI(Type), {Name, NameLen}, unwrapDI(File), LineNo, + unwrapDI(Scope), AlignInBits)); } LLVMMetadataRef diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index e4036ee1eb0c9..9b42a5a0e1b5b 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -23,6 +23,9 @@ using namespace llvm; +const DIExpression::FragmentInfo DebugVariable::DefaultFragment = { + std::numeric_limits::max(), std::numeric_limits::min()}; + DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, ArrayRef MDs, bool ImplicitCode) @@ -1148,10 +1151,14 @@ Optional DIExpression::createFragmentExpression( for (auto Op : Expr->expr_ops()) { switch (Op.getOp()) { default: break; + case dwarf::DW_OP_shr: + case dwarf::DW_OP_shra: + case dwarf::DW_OP_shl: case dwarf::DW_OP_plus: + case dwarf::DW_OP_plus_uconst: case dwarf::DW_OP_minus: - // We can't safely split arithmetic into multiple fragments because we - // can't express carry-over between fragments. + // We can't safely split arithmetic or shift operations into multiple + // fragments because we can't express carry-over between fragments. // // FIXME: We *could* preserve the lowest fragment of a constant offset // operation if the offset fits into SizeInBits. 
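// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// The C API change above is source-breaking for existing callers, which must
// now pass an alignment; 0 preserves the previous behaviour. Assuming
// Builder, IntTy, File and Scope were created earlier:
LLVMMetadataRef MyIntTypedef = LLVMDIBuilderCreateTypedef(
    Builder, IntTy, "my_int", /*NameLen=*/6, File, /*LineNo=*/42, Scope,
    /*AlignInBits=*/0);
// ----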
diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index 8fa97a3aecb73..03657ff8d9d43 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -57,7 +57,7 @@ PreservedAnalyses PrintFunctionPass::run(Function &F, if (forcePrintModuleIR()) OS << Banner << " (function: " << F.getName() << ")\n" << *F.getParent(); else - OS << Banner << static_cast(F); + OS << Banner << '\n' << static_cast(F); } return PreservedAnalyses::all(); } diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 8fe59912f20ac..90239bb762989 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -1776,58 +1776,42 @@ LLVM_DUMP_METHOD void PMStack::dump() const { void ModulePass::assignPassManager(PMStack &PMS, PassManagerType PreferredType) { // Find Module Pass Manager - while (!PMS.empty()) { - PassManagerType TopPMType = PMS.top()->getPassManagerType(); - if (TopPMType == PreferredType) - break; // We found desired pass manager - else if (TopPMType > PMT_ModulePassManager) - PMS.pop(); // Pop children pass managers - else - break; - } - assert(!PMS.empty() && "Unable to find appropriate Pass Manager"); + PassManagerType T; + while ((T = PMS.top()->getPassManagerType()) > PMT_ModulePassManager && + T != PreferredType) + PMS.pop(); PMS.top()->add(this); } /// Find appropriate Function Pass Manager or Call Graph Pass Manager /// in the PM Stack and add self into that manager. void FunctionPass::assignPassManager(PMStack &PMS, - PassManagerType PreferredType) { - + PassManagerType /*PreferredType*/) { // Find Function Pass Manager - while (!PMS.empty()) { - if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager) - PMS.pop(); - else - break; - } + PMDataManager *PM; + while (PM = PMS.top(), PM->getPassManagerType() > PMT_FunctionPassManager) + PMS.pop(); // Create new Function Pass Manager if needed. - FPPassManager *FPP; - if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) { - FPP = (FPPassManager *)PMS.top(); - } else { - assert(!PMS.empty() && "Unable to create Function Pass Manager"); - PMDataManager *PMD = PMS.top(); - + if (PM->getPassManagerType() != PMT_FunctionPassManager) { // [1] Create new Function Pass Manager - FPP = new FPPassManager(); + auto *FPP = new FPPassManager; FPP->populateInheritedAnalysis(PMS); // [2] Set up new manager's top level manager - PMTopLevelManager *TPM = PMD->getTopLevelManager(); - TPM->addIndirectPassManager(FPP); + PM->getTopLevelManager()->addIndirectPassManager(FPP); // [3] Assign manager to manage this new manager. This may create // and push new managers into PMS - FPP->assignPassManager(PMS, PMD->getPassManagerType()); + FPP->assignPassManager(PMS, PM->getPassManagerType()); // [4] Push new manager into PMS PMS.push(FPP); + PM = FPP; } // Assign FPP as the manager of this pass. - FPP->add(this); + PM->add(this); } PassManagerBase::~PassManagerBase() {} diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index ebe22c37c707b..0d48090e4268b 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1381,8 +1381,12 @@ lto::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses, StringRef RemarksFormat, bool RemarksWithHotness, int Count) { std::string Filename = RemarksFilename; + // For ThinLTO, file.opt. becomes + // file.opt..thin... if (!Filename.empty() && Count != -1) - Filename += ".thin." + llvm::utostr(Count) + ".yaml"; + Filename = + (Twine(Filename) + ".thin." 
+ llvm::utostr(Count) + "." + RemarksFormat) + .str(); auto ResultOrErr = llvm::setupOptimizationRemarks( Context, Filename, RemarksPasses, RemarksFormat, RemarksWithHotness); diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp index bcc7c45afc01b..b4b3c9956cc2d 100644 --- a/llvm/lib/MC/MCDwarf.cpp +++ b/llvm/lib/MC/MCDwarf.cpp @@ -1701,7 +1701,8 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart, MakeStartMinusEndExpr(Streamer, SectionStart, cieStart, 0); emitAbsValue(Streamer, offset, 4); } else { - Streamer.EmitSymbolValue(&cieStart, 4); + Streamer.EmitSymbolValue(&cieStart, 4, + asmInfo->needsDwarfSectionOffsetDirective()); } // PC Begin diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index 5b4da1998c414..9aee0a5ca4e5e 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -463,6 +463,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) { DebugSecType, ELF::SHF_EXCLUDE); DwarfRnglistsDWOSection = Ctx->getELFSection(".debug_rnglists.dwo", DebugSecType, ELF::SHF_EXCLUDE); + DwarfMacinfoDWOSection = + Ctx->getELFSection(".debug_macinfo.dwo", DebugSecType, ELF::SHF_EXCLUDE); DwarfLoclistsDWOSection = Ctx->getELFSection(".debug_loclists.dwo", DebugSecType, ELF::SHF_EXCLUDE); diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp index f646168d3a4a7..8377e295532ae 100644 --- a/llvm/lib/MC/MCSectionXCOFF.cpp +++ b/llvm/lib/MC/MCSectionXCOFF.cpp @@ -40,6 +40,8 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T, case XCOFF::XMC_DS: OS << "\t.csect " << QualName->getName() << '\n'; break; + case XCOFF::XMC_TC: + break; case XCOFF::XMC_TC0: OS << "\t.toc\n"; break; diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index c40a067e93e17..6efa167ced42a 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -50,12 +50,6 @@ void MCXCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, XCOFF::C_HIDEXT); Symbol->setCommon(Size, ByteAlignment); - // Need to add this symbol to the current Fragment which will belong to the - // containing CSECT. - auto *F = dyn_cast_or_null(getCurrentFragment()); - assert(F && "Expected a valid section with a fragment set."); - Symbol->setFragment(F); - // Emit the alignment and storage for the variable to the section. EmitValueToAlignment(ByteAlignment); EmitZeros(Size); diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index ca96a0ecf9ff5..773ca3a0909de 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -275,6 +275,12 @@ CsectGroup &XCOFFObjectWriter::getCsectGroup(const MCSectionXCOFF *MCSec) { "We should have only one TOC-base, and it should be the first csect " "in this CsectGroup."); return TOCCsects; + case XCOFF::XMC_TC: + assert(XCOFF::XTY_SD == MCSec->getCSectType() && + "Only an initialized csect can contain TC entry."); + assert(!TOCCsects.empty() && + "We should at least have a TOC-base in this CsectGroup."); + return TOCCsects; default: report_fatal_error("Unhandled mapping of csect to section."); } @@ -574,7 +580,7 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) { // yet, so start at index 0. uint32_t SymbolTableIndex = 0; - // Calculate undefined symbol's indices. + // Calculate indices for undefined symbols. 
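// ---- Editor's aside (worked example, not part of the patch) ----
// The LTO.cpp change earlier in this patch makes per-task ThinLTO remark
// files carry the serialization format as their extension instead of a
// hard-coded ".yaml":
std::string Filename = "out.opt";
StringRef RemarksFormat = "bitstream";
int Count = 3;
Filename = (Twine(Filename) + ".thin." + llvm::utostr(Count) + "." +
            RemarksFormat)
               .str();
// Filename == "out.opt.thin.3.bitstream"
// ----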
for (auto &Csect : UndefinedCsects) { Csect.Size = 0; Csect.Address = 0; diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp index e8b54a7e60200..d2e6fdfea009a 100644 --- a/llvm/lib/ObjectYAML/ELFEmitter.cpp +++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp @@ -248,7 +248,7 @@ ELFState::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH) ImplicitSections.push_back(".symtab"); ImplicitSections.insert(ImplicitSections.end(), {".strtab", ".shstrtab"}); - if (!Doc.DynamicSymbols.empty()) + if (Doc.DynamicSymbols) ImplicitSections.insert(ImplicitSections.end(), {".dynsym", ".dynstr"}); // Insert placeholders for implicit sections that are not @@ -562,21 +562,24 @@ void ELFState::initSymtabSectionHeader(Elf_Shdr &SHeader, ArrayRef Symbols; if (IsStatic && Doc.Symbols) Symbols = *Doc.Symbols; - else if (!IsStatic) - Symbols = Doc.DynamicSymbols; + else if (!IsStatic && Doc.DynamicSymbols) + Symbols = *Doc.DynamicSymbols; ELFYAML::RawContentSection *RawSec = dyn_cast_or_null(YAMLSec); - if (RawSec && !Symbols.empty() && (RawSec->Content || RawSec->Size)) { - if (RawSec->Content) - reportError("cannot specify both `Content` and " + - (IsStatic ? Twine("`Symbols`") : Twine("`DynamicSymbols`")) + - " for symbol table section '" + RawSec->Name + "'"); - if (RawSec->Size) - reportError("cannot specify both `Size` and " + - (IsStatic ? Twine("`Symbols`") : Twine("`DynamicSymbols`")) + - " for symbol table section '" + RawSec->Name + "'"); - return; + if (RawSec && (RawSec->Content || RawSec->Size)) { + bool HasSymbolsDescription = + (IsStatic && Doc.Symbols) || (!IsStatic && Doc.DynamicSymbols); + if (HasSymbolsDescription) { + StringRef Property = (IsStatic ? "`Symbols`" : "`DynamicSymbols`"); + if (RawSec->Content) + reportError("cannot specify both `Content` and " + Property + + " for symbol table section '" + RawSec->Name + "'"); + if (RawSec->Size) + reportError("cannot specify both `Size` and " + Property + + " for symbol table section '" + RawSec->Name + "'"); + return; + } } zero(SHeader); @@ -985,9 +988,19 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, raw_ostream &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + SHeader.sh_info = Section.Info; + + if (Section.Content) { + SHeader.sh_size = writeContent(OS, Section.Content, None); + return; + } + + if (!Section.Entries) + return; + uint64_t AuxCnt = 0; - for (size_t I = 0; I < Section.Entries.size(); ++I) { - const ELFYAML::VerdefEntry &E = Section.Entries[I]; + for (size_t I = 0; I < Section.Entries->size(); ++I) { + const ELFYAML::VerdefEntry &E = (*Section.Entries)[I]; Elf_Verdef VerDef; VerDef.vd_version = E.Version; @@ -996,7 +1009,7 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, VerDef.vd_hash = E.Hash; VerDef.vd_aux = sizeof(Elf_Verdef); VerDef.vd_cnt = E.VerNames.size(); - if (I == Section.Entries.size() - 1) + if (I == Section.Entries->size() - 1) VerDef.vd_next = 0; else VerDef.vd_next = @@ -1014,9 +1027,8 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, } } - SHeader.sh_size = Section.Entries.size() * sizeof(Elf_Verdef) + + SHeader.sh_size = Section.Entries->size() * sizeof(Elf_Verdef) + AuxCnt * sizeof(Elf_Verdaux); - SHeader.sh_info = Section.Info; } template @@ -1027,15 +1039,24 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, typedef typename ELFT::Vernaux Elf_Vernaux; auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + SHeader.sh_info = Section.Info; + + if (Section.Content) { + SHeader.sh_size = writeContent(OS, 
Section.Content, None); + return; + } + + if (!Section.VerneedV) + return; uint64_t AuxCnt = 0; - for (size_t I = 0; I < Section.VerneedV.size(); ++I) { - const ELFYAML::VerneedEntry &VE = Section.VerneedV[I]; + for (size_t I = 0; I < Section.VerneedV->size(); ++I) { + const ELFYAML::VerneedEntry &VE = (*Section.VerneedV)[I]; Elf_Verneed VerNeed; VerNeed.vn_version = VE.Version; VerNeed.vn_file = DotDynstr.getOffset(VE.File); - if (I == Section.VerneedV.size() - 1) + if (I == Section.VerneedV->size() - 1) VerNeed.vn_next = 0; else VerNeed.vn_next = @@ -1060,9 +1081,8 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, } } - SHeader.sh_size = Section.VerneedV.size() * sizeof(Elf_Verneed) + + SHeader.sh_size = Section.VerneedV->size() * sizeof(Elf_Verneed) + AuxCnt * sizeof(Elf_Vernaux); - SHeader.sh_info = Section.Info; } template @@ -1317,7 +1337,8 @@ template void ELFState::buildSymbolIndexes() { if (Doc.Symbols) Build(*Doc.Symbols, SymN2I); - Build(Doc.DynamicSymbols, DynSymN2I); + if (Doc.DynamicSymbols) + Build(*Doc.DynamicSymbols, DynSymN2I); } template void ELFState::finalizeStrings() { @@ -1328,22 +1349,26 @@ template void ELFState::finalizeStrings() { DotStrtab.finalize(); // Add the dynamic symbol names to .dynstr section. - for (const ELFYAML::Symbol &Sym : Doc.DynamicSymbols) - DotDynstr.add(ELFYAML::dropUniqueSuffix(Sym.Name)); + if (Doc.DynamicSymbols) + for (const ELFYAML::Symbol &Sym : *Doc.DynamicSymbols) + DotDynstr.add(ELFYAML::dropUniqueSuffix(Sym.Name)); // SHT_GNU_verdef and SHT_GNU_verneed sections might also // add strings to .dynstr section. for (const ELFYAML::Chunk *Sec : Doc.getSections()) { if (auto VerNeed = dyn_cast(Sec)) { - for (const ELFYAML::VerneedEntry &VE : VerNeed->VerneedV) { - DotDynstr.add(VE.File); - for (const ELFYAML::VernauxEntry &Aux : VE.AuxV) - DotDynstr.add(Aux.Name); + if (VerNeed->VerneedV) { + for (const ELFYAML::VerneedEntry &VE : *VerNeed->VerneedV) { + DotDynstr.add(VE.File); + for (const ELFYAML::VernauxEntry &Aux : VE.AuxV) + DotDynstr.add(Aux.Name); + } } } else if (auto VerDef = dyn_cast(Sec)) { - for (const ELFYAML::VerdefEntry &E : VerDef->Entries) - for (StringRef Name : E.VerNames) - DotDynstr.add(Name); + if (VerDef->Entries) + for (const ELFYAML::VerdefEntry &E : *VerDef->Entries) + for (StringRef Name : E.VerNames) + DotDynstr.add(Name); } } diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index a5e5894af04d4..c8de7a662fc18 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -54,6 +54,7 @@ void ScalarEnumerationTraits::enumeration( ECase(PT_GNU_EH_FRAME); ECase(PT_GNU_STACK); ECase(PT_GNU_RELRO); + ECase(PT_GNU_PROPERTY); #undef ECase IO.enumFallback(Value); } @@ -1074,7 +1075,8 @@ static void sectionMapping(IO &IO, ELFYAML::NoBitsSection &Section) { static void sectionMapping(IO &IO, ELFYAML::VerdefSection &Section) { commonSectionMapping(IO, Section); IO.mapRequired("Info", Section.Info); - IO.mapRequired("Entries", Section.Entries); + IO.mapOptional("Entries", Section.Entries); + IO.mapOptional("Content", Section.Content); } static void sectionMapping(IO &IO, ELFYAML::SymverSection &Section) { @@ -1085,7 +1087,8 @@ static void sectionMapping(IO &IO, ELFYAML::SymverSection &Section) { static void sectionMapping(IO &IO, ELFYAML::VerneedSection &Section) { commonSectionMapping(IO, Section); IO.mapRequired("Info", Section.Info); - IO.mapRequired("Dependencies", Section.VerneedV); + IO.mapOptional("Dependencies", Section.VerneedV); + IO.mapOptional("Content", 
Section.Content); } static void sectionMapping(IO &IO, ELFYAML::RelocationSection &Section) { @@ -1419,6 +1422,20 @@ StringRef MappingTraits>::validate( return {}; } + if (const auto *VD = dyn_cast(C.get())) { + if (VD->Entries && VD->Content) + return "SHT_GNU_verdef: \"Entries\" and \"Content\" can't be used " + "together"; + return {}; + } + + if (const auto *VD = dyn_cast(C.get())) { + if (VD->VerneedV && VD->Content) + return "SHT_GNU_verneed: \"Dependencies\" and \"Content\" can't be used " + "together"; + return {}; + } + return {}; } diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index d988506b5e980..8b583bde5909c 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -240,6 +240,7 @@ FUNCTION_PASS("verify", LoopVerifierPass()) FUNCTION_PASS("verify", MemorySSAVerifierPass()) FUNCTION_PASS("verify", RegionInfoVerifierPass()) FUNCTION_PASS("verify", SafepointIRVerifierPass()) +FUNCTION_PASS("verify", ScalarEvolutionVerifierPass()) FUNCTION_PASS("view-cfg", CFGViewerPass()) FUNCTION_PASS("view-cfg-only", CFGOnlyViewerPass()) FUNCTION_PASS("transform-warning", WarnMissedTransformationsPass()) diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index 92e0f5b221048..1f424075d47fd 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -24,11 +24,13 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" #include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Host.h" @@ -37,9 +39,11 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Process.h" #include "llvm/Support/StringSaver.h" +#include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/raw_ostream.h" #include #include +#include using namespace llvm; using namespace cl; @@ -1043,14 +1047,16 @@ static bool hasUTF8ByteOrderMark(ArrayRef S) { return (S.size() >= 3 && S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf'); } -static bool ExpandResponseFile(StringRef FName, StringSaver &Saver, - TokenizerCallback Tokenizer, - SmallVectorImpl &NewArgv, - bool MarkEOLs, bool RelativeNames) { - ErrorOr> MemBufOrErr = - MemoryBuffer::getFile(FName); +// FName must be an absolute path. 
+static llvm::Error ExpandResponseFile( + StringRef FName, StringSaver &Saver, TokenizerCallback Tokenizer, + SmallVectorImpl &NewArgv, bool MarkEOLs, bool RelativeNames, + llvm::vfs::FileSystem &FS) { + assert(sys::path::is_absolute(FName)); + llvm::ErrorOr> MemBufOrErr = + FS.getBufferForFile(FName); if (!MemBufOrErr) - return false; + return llvm::errorCodeToError(MemBufOrErr.getError()); MemoryBuffer &MemBuf = *MemBufOrErr.get(); StringRef Str(MemBuf.getBufferStart(), MemBuf.getBufferSize()); @@ -1059,7 +1065,8 @@ static bool ExpandResponseFile(StringRef FName, StringSaver &Saver, std::string UTF8Buf; if (hasUTF16ByteOrderMark(BufRef)) { if (!convertUTF16ToUTF8String(BufRef, UTF8Buf)) - return false; + return llvm::createStringError(std::errc::illegal_byte_sequence, + "Could not convert UTF16 to UTF8"); Str = StringRef(UTF8Buf); } // If we see UTF-8 BOM sequence at the beginning of a file, we shall remove @@ -1071,41 +1078,40 @@ static bool ExpandResponseFile(StringRef FName, StringSaver &Saver, // Tokenize the contents into NewArgv. Tokenizer(Str, Saver, NewArgv, MarkEOLs); + if (!RelativeNames) + return Error::success(); + llvm::StringRef BasePath = llvm::sys::path::parent_path(FName); // If names of nested response files should be resolved relative to including // file, replace the included response file names with their full paths // obtained by required resolution. - if (RelativeNames) - for (unsigned I = 0; I < NewArgv.size(); ++I) - if (NewArgv[I]) { - StringRef Arg = NewArgv[I]; - if (Arg.front() == '@') { - StringRef FileName = Arg.drop_front(); - if (llvm::sys::path::is_relative(FileName)) { - SmallString<128> ResponseFile; - ResponseFile.append(1, '@'); - if (llvm::sys::path::is_relative(FName)) { - SmallString<128> curr_dir; - llvm::sys::fs::current_path(curr_dir); - ResponseFile.append(curr_dir.str()); - } - llvm::sys::path::append( - ResponseFile, llvm::sys::path::parent_path(FName), FileName); - NewArgv[I] = Saver.save(ResponseFile.c_str()).data(); - } - } - } + for (auto &Arg : NewArgv) { + // Skip non-rsp file arguments. + if (!Arg || Arg[0] != '@') + continue; - return true; + StringRef FileName(Arg + 1); + // Skip if non-relative. + if (!llvm::sys::path::is_relative(FileName)) + continue; + + SmallString<128> ResponseFile; + ResponseFile.push_back('@'); + ResponseFile.append(BasePath); + llvm::sys::path::append(ResponseFile, FileName); + Arg = Saver.save(ResponseFile.c_str()).data(); + } + return Error::success(); } /// Expand response files on a command line recursively using the given /// StringSaver and tokenization strategy. bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, - SmallVectorImpl &Argv, - bool MarkEOLs, bool RelativeNames) { + SmallVectorImpl &Argv, bool MarkEOLs, + bool RelativeNames, llvm::vfs::FileSystem &FS, + llvm::Optional CurrentDir) { bool AllExpanded = true; struct ResponseFileRecord { - const char *File; + std::string File; size_t End; }; @@ -1139,8 +1145,31 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, } const char *FName = Arg + 1; - auto IsEquivalent = [FName](const ResponseFileRecord &RFile) { - return sys::fs::equivalent(RFile.File, FName); + // Note that CurrentDir is only used for top-level rsp files, the rest will + // always have an absolute path deduced from the containing file. 
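// ---- Editor's aside (illustrative sketch, not part of the patch) ----
// Expansion is now virtualizable: callers can supply a vfs::FileSystem plus
// a base directory against which relative top-level response files are
// resolved. A sketch using an in-memory file system (helper names outside
// the hunks above are assumptions):
llvm::BumpPtrAllocator Alloc;
llvm::StringSaver Saver(Alloc);
llvm::SmallVector<const char *, 4> Argv = {"prog", "@args.rsp"};
llvm::IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> FS(
    new llvm::vfs::InMemoryFileSystem());
FS->addFile("/work/args.rsp", /*ModificationTime=*/0,
            llvm::MemoryBuffer::getMemBuffer("-O2 -g"));
cl::ExpandResponseFiles(Saver, cl::TokenizeGNUCommandLine, Argv,
                        /*MarkEOLs=*/false, /*RelativeNames=*/false, *FS,
                        /*CurrentDir=*/llvm::StringRef("/work"));
// Argv is now {"prog", "-O2", "-g"}.
// ----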
+ SmallString<128> CurrDir; + if (llvm::sys::path::is_relative(FName)) { + if (!CurrentDir) + llvm::sys::fs::current_path(CurrDir); + else + CurrDir = *CurrentDir; + llvm::sys::path::append(CurrDir, FName); + FName = CurrDir.c_str(); + } + auto IsEquivalent = [FName, &FS](const ResponseFileRecord &RFile) { + llvm::ErrorOr LHS = FS.status(FName); + if (!LHS) { + // TODO: The error should be propagated up the stack. + llvm::consumeError(llvm::errorCodeToError(LHS.getError())); + return false; + } + llvm::ErrorOr RHS = FS.status(RFile.File); + if (!RHS) { + // TODO: The error should be propagated up the stack. + llvm::consumeError(llvm::errorCodeToError(RHS.getError())); + return false; + } + return LHS->equivalent(*RHS); }; // Check for recursive response files. @@ -1155,10 +1184,13 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, // Replace this response file argument with the tokenization of its // contents. Nested response files are expanded in subsequent iterations. SmallVector ExpandedArgv; - if (!ExpandResponseFile(FName, Saver, Tokenizer, ExpandedArgv, MarkEOLs, - RelativeNames)) { + if (llvm::Error Err = + ExpandResponseFile(FName, Saver, Tokenizer, ExpandedArgv, MarkEOLs, + RelativeNames, FS)) { // We couldn't read this file, so we leave it in the argument stream and // move on. + // TODO: The error should be propagated up the stack. + llvm::consumeError(std::move(Err)); AllExpanded = false; ++I; continue; @@ -1186,9 +1218,20 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, bool cl::readConfigFile(StringRef CfgFile, StringSaver &Saver, SmallVectorImpl &Argv) { - if (!ExpandResponseFile(CfgFile, Saver, cl::tokenizeConfigFile, Argv, - /*MarkEOLs*/ false, /*RelativeNames*/ true)) + SmallString<128> AbsPath; + if (sys::path::is_relative(CfgFile)) { + llvm::sys::fs::current_path(AbsPath); + llvm::sys::path::append(AbsPath, CfgFile); + CfgFile = AbsPath.str(); + } + if (llvm::Error Err = + ExpandResponseFile(CfgFile, Saver, cl::tokenizeConfigFile, Argv, + /*MarkEOLs*/ false, /*RelativeNames*/ true, + *llvm::vfs::getRealFileSystem())) { + // TODO: The error should be propagated up the stack. + llvm::consumeError(std::move(Err)); return false; + } return ExpandResponseFiles(Saver, cl::tokenizeConfigFile, Argv, /*MarkEOLs*/ false, /*RelativeNames*/ true); } diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp index 7e07b8f7ca264..ef38c1c09413a 100644 --- a/llvm/lib/Support/Host.cpp +++ b/llvm/lib/Support/Host.cpp @@ -140,6 +140,9 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) { .Case("POWER8E", "pwr8") .Case("POWER8NVL", "pwr8") .Case("POWER9", "pwr9") + // FIXME: If we get a simulator or machine with the capabilities of + // mcpu=future, we should revisit this and add the name reported by the + // simulator/machine. .Default(generic); } diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index a6c591fca3121..8f3f4aa8caeaf 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -21,8 +21,8 @@ static KnownBits computeForAddCarry( assert(!(CarryZero && CarryOne) && "Carry can't be zero and one at the same time"); - APInt PossibleSumZero = ~LHS.Zero + ~RHS.Zero + !CarryZero; - APInt PossibleSumOne = LHS.One + RHS.One + CarryOne; + APInt PossibleSumZero = LHS.getMaxValue() + RHS.getMaxValue() + !CarryZero; + APInt PossibleSumOne = LHS.getMinValue() + RHS.getMinValue() + CarryOne; // Compute known bits of the carry. 
APInt CarryKnownZero = ~(PossibleSumZero ^ LHS.Zero ^ RHS.Zero); diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index 14def83802daf..3c9a08cb4077d 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -496,27 +496,50 @@ void replace_extension(SmallVectorImpl &path, const Twine &extension, path.append(ext.begin(), ext.end()); } -void replace_path_prefix(SmallVectorImpl &Path, +bool replace_path_prefix(SmallVectorImpl &Path, const StringRef &OldPrefix, const StringRef &NewPrefix, - Style style) { + Style style, bool strict) { if (OldPrefix.empty() && NewPrefix.empty()) - return; + return false; StringRef OrigPath(Path.begin(), Path.size()); - if (!OrigPath.startswith(OldPrefix)) - return; + StringRef OldPrefixDir; + + if (!strict && OldPrefix.size() > OrigPath.size()) + return false; + + // Ensure OldPrefixDir does not have a trailing separator. + if (!OldPrefix.empty() && is_separator(OldPrefix.back())) + OldPrefixDir = parent_path(OldPrefix, style); + else + OldPrefixDir = OldPrefix; + + if (!OrigPath.startswith(OldPrefixDir)) + return false; + + if (OrigPath.size() > OldPrefixDir.size()) + if (!is_separator(OrigPath[OldPrefixDir.size()], style) && strict) + return false; // If prefixes have the same size we can simply copy the new one over. - if (OldPrefix.size() == NewPrefix.size()) { + if (OldPrefixDir.size() == NewPrefix.size() && !strict) { llvm::copy(NewPrefix, Path.begin()); - return; + return true; } - StringRef RelPath = OrigPath.substr(OldPrefix.size()); + StringRef RelPath = OrigPath.substr(OldPrefixDir.size()); SmallString<256> NewPath; path::append(NewPath, style, NewPrefix); - path::append(NewPath, style, RelPath); + if (!RelPath.empty()) { + if (!is_separator(RelPath[0], style) || !strict) + path::append(NewPath, style, RelPath); + else + path::append(NewPath, style, relative_path(RelPath, style)); + } + Path.swap(NewPath); + + return true; } void native(const Twine &path, SmallVectorImpl &result, Style style) { diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp index c3d742388aafa..6c993387e59d8 100644 --- a/llvm/lib/Support/TimeProfiler.cpp +++ b/llvm/lib/Support/TimeProfiler.cpp @@ -13,8 +13,8 @@ #include "llvm/Support/TimeProfiler.h" #include "llvm/ADT/StringMap.h" #include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileSystem.h" #include "llvm/Support/JSON.h" +#include "llvm/Support/Path.h" #include #include #include @@ -33,14 +33,14 @@ typedef std::pair NameAndCountAndDurationType; struct Entry { - TimePointType Start; + const TimePointType Start; TimePointType End; - std::string Name; - std::string Detail; + const std::string Name; + const std::string Detail; Entry(TimePointType &&S, TimePointType &&E, std::string &&N, std::string &&Dt) : Start(std::move(S)), End(std::move(E)), Name(std::move(N)), - Detail(std::move(Dt)){}; + Detail(std::move(Dt)) {} // Calculate timings for FlameGraph. Cast time points to microsecond precision // rather than casting duration. 
This avoids truncation issues causing inner @@ -59,10 +59,9 @@ struct Entry { }; struct TimeTraceProfiler { - TimeTraceProfiler(unsigned TimeTraceGranularity = 0) - : TimeTraceGranularity(TimeTraceGranularity) { - StartTime = steady_clock::now(); - } + TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "") + : StartTime(steady_clock::now()), ProcName(ProcName), + TimeTraceGranularity(TimeTraceGranularity) {} void begin(std::string Name, llvm::function_ref<std::string()> Detail) { Stack.emplace_back(steady_clock::now(), TimePointType(), std::move(Name), @@ -169,7 +168,7 @@ struct TimeTraceProfiler { J.attribute("ts", 0); J.attribute("ph", "M"); J.attribute("name", "process_name"); - J.attributeObject("args", [&] { J.attribute("name", "clang"); }); + J.attributeObject("args", [&] { J.attribute("name", ProcName); }); }); J.arrayEnd(); @@ -180,16 +179,19 @@ struct TimeTraceProfiler { SmallVector<Entry, 16> Stack; SmallVector<Entry, 128> Entries; StringMap<CountAndDurationType> CountAndTotalPerName; - TimePointType StartTime; + const TimePointType StartTime; + const std::string ProcName; // Minimum time granularity (in microseconds) - unsigned TimeTraceGranularity; + const unsigned TimeTraceGranularity; }; -void timeTraceProfilerInitialize(unsigned TimeTraceGranularity) { +void timeTraceProfilerInitialize(unsigned TimeTraceGranularity, + StringRef ProcName) { assert(TimeTraceProfilerInstance == nullptr && "Profiler should not be initialized"); - TimeTraceProfilerInstance = new TimeTraceProfiler(TimeTraceGranularity); + TimeTraceProfilerInstance = new TimeTraceProfiler( + TimeTraceGranularity, llvm::sys::path::filename(ProcName)); } void timeTraceProfilerCleanup() { diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc index 8c26fa9b8f29e..2b31672670c89 100644 --- a/llvm/lib/Support/Unix/Signals.inc +++ b/llvm/lib/Support/Unix/Signals.inc @@ -361,16 +361,16 @@ static RETSIGTYPE SignalHandler(int Sig) { { RemoveFilesToRemove(); + if (Sig == SIGPIPE) + if (auto OldOneShotPipeFunction = + OneShotPipeSignalFunction.exchange(nullptr)) + return OldOneShotPipeFunction(); + if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig) != std::end(IntSigs)) { if (auto OldInterruptFunction = InterruptFunction.exchange(nullptr)) return OldInterruptFunction(); - if (Sig == SIGPIPE) - if (auto OldOneShotPipeFunction = - OneShotPipeSignalFunction.exchange(nullptr)) - return OldOneShotPipeFunction(); - raise(Sig); // Execute the default handler. return; } diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index da156d6084178..054ef8f482ca9 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -352,8 +352,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { } // Check for flag reads and clobbers.
- MIOperands::PhysRegInfo PRI = - MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI); + PhysRegInfo PRI = AnalyzePhysRegInBundle(*I, AArch64::NZCV, TRI); if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index eca9b1e75c2ac..8f88198203d74 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -452,9 +452,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( const AArch64Subtarget &Subtarget = MF.getSubtarget(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - if (MF.getFunction().hasOptSize()) - return false; - if (AFI->getLocalStackSize() == 0) return false; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 9e8df33218b88..db00f81e53eda 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -828,6 +828,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1) setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); @@ -1333,6 +1335,14 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; + case AArch64ISD::INSR: return "AArch64ISD::INSR"; + case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; + case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; + case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW"; + case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW"; + case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED"; + case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED"; + case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM"; } return nullptr; } @@ -2884,6 +2894,16 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_sve_insr: { + SDValue Scalar = Op.getOperand(2); + EVT ScalarTy = Scalar.getValueType(); + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) + Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); + + return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), + Op.getOperand(1), Scalar); + } + case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); const auto *RegInfo = Subtarget->getRegisterInfo(); @@ -11747,6 +11767,85 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(MinOffset, DL, MVT::i64)); } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. 
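+// (Editor's note, illustrative and not part of the patch: per the switch
+// below, e.g. nxv4i16 lives in an nxv4i32 container and nxv2f32 in an
+// nxv2i64 container. SVE gather loads always fill a vector of full 32-bit
+// (.s) or 64-bit (.d) elements, which is then truncated or bitcast back to
+// the requested element type by the combine further down.)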
+static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + } +} + +static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode) { + EVT RetVT = N->getValueType(0); + assert(RetVT.isScalableVector() && + "Gather loads are only possible for SVE vectors"); + + SDLoc DL(N); + MVT RetElVT = RetVT.getVectorElementType().getSimpleVT(); + unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits(); + + EVT MaxVT = llvm::MVT::getScalableVectorVT(RetElVT, NumElements); + if (RetVT.getSizeInBits().getKnownMinSize() > + MaxVT.getSizeInBits().getKnownMinSize()) + return SDValue(); + + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(3); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + const SDValue Offset = N->getOperand(4); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) || + !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType())) + return SDValue(); + + // Return value type that is representable in hardware + EVT HwRetVt = getSVEContainerType(RetVT); + + // Keep the original output value type around - this will better inform + // optimisations (e.g. instruction folding when load is followed by + // zext/sext). This will only be used for ints, so the value for FPs + // doesn't matter. + SDValue OutVT = DAG.getValueType(RetVT); + if (RetVT.isFloatingPoint()) + OutVT = DAG.getValueType(HwRetVt); + + SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); + SDValue Ops[] = {N->getOperand(0), // Chain + N->getOperand(2), // Pg + Base, Offset, OutVT}; + + SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (RetVT.isInteger() && (RetVT != HwRetVt)) + Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); + + // If the original return value was FP, bitcast accordingly. Doing it here + // means that we can avoid adding TableGen patterns for FPs. 
+ if (RetVT.isFloatingPoint()) + Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0)); + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -11833,6 +11932,20 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); + case Intrinsic::aarch64_sve_ld1_gather: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + case Intrinsic::aarch64_sve_ld1_gather_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + case Intrinsic::aarch64_sve_ld1_gather_sxtw: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW); + case Intrinsic::aarch64_sve_ld1_gather_uxtw: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW); + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED); + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED); + case Intrinsic::aarch64_sve_ld1_gather_imm: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 384c7b4456f0a..118ab7f3d25e6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -196,6 +196,17 @@ enum NodeType : unsigned { UUNPKHI, UUNPKLO, + INSR, + + // Unsigned gather loads. + GLD1, + GLD1_SCALED, + GLD1_UXTW, + GLD1_SXTW, + GLD1_UXTW_SCALED, + GLD1_SXTW_SCALED, + GLD1_IMM, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index d6bf9bcd805ca..fee825422ca4f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -358,6 +358,16 @@ def am_indexed7s128 : ComplexPattern; def am_indexedu6s128 : ComplexPattern; def am_indexeds9s128 : ComplexPattern; +def UImmS2XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def UImmS4XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64); +}]>; +def UImmS8XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64); +}]>; + // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N].
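// (Editor's illustration, not in the original patch: for uimm5s2 the
// predicate below accepts the immediates 0, 2, ..., 62, and the new
// UImmS2XForm encodes an accepted immediate by dividing out the scale, so a
// byte offset of 6 is emitted as the instruction field value 3; an odd
// immediate fails the (Imm % 2) == 0 check and never reaches the XForm.)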
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>; @@ -365,17 +375,20 @@ def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>; def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>; def uimm5s2 : Operand, ImmLeaf= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }], + UImmS2XForm> { let ParserMatchClass = UImm5s2Operand; let PrintMethod = "printImmScale<2>"; } def uimm5s4 : Operand, ImmLeaf= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }], + UImmS4XForm> { let ParserMatchClass = UImm5s4Operand; let PrintMethod = "printImmScale<4>"; } def uimm5s8 : Operand, ImmLeaf= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }], + UImmS8XForm> { let ParserMatchClass = UImm5s8Operand; let PrintMethod = "printImmScale<8>"; } @@ -1473,7 +1486,7 @@ multiclass AuthLoad { (!cast(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>; def : InstAlias(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0)>; + (!cast(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0), 0>; } //--- @@ -10407,9 +10420,9 @@ class CryptoRRTiedop0, bits<2>op1, string asm, string asmops> let Inst{11-10} = op1; } class CryptoRRTied_2Dop0, bits<2>op1, string asm> - : CryptoRRTied; + : CryptoRRTied; class CryptoRRTied_4Sop0, bits<2>op1, string asm> - : CryptoRRTied; + : CryptoRRTied; class CryptoRRR op0, bits<2>op1, dag oops, dag iops, string asm, string asmops, string cst> @@ -10424,19 +10437,19 @@ class CryptoRRR op0, bits<2>op1, dag oops, dag iops, string asm, } class CryptoRRR_2D op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "">; class CryptoRRRTied_2D op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRR_4S op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "">; class CryptoRRRTied_4S op0, bits<2>op1, string asm> : CryptoRRR; + "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRRTied op0, bits<2>op1, string asm> : CryptoRRR; + asm, "{\t$Vd, $Vn, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRRRop0, string asm, string asmops> : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm, @@ -10450,15 +10463,18 @@ class CryptoRRRRop0, string asm, string asmops> let Inst{14-10} = Va; } class CryptoRRRR_16Bop0, string asm> - : CryptoRRRR { + : CryptoRRRR { } class CryptoRRRR_4Sop0, string asm> - : CryptoRRRR { + : CryptoRRRR { } class CryptoRRRi6 : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm, - "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> { + "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm" # + "|.2d\t$Vd, $Vn, $Vm, $imm}", "", []> { bits<6> imm; bits<5> Vm; let Inst{24-21} = 0b0100; @@ -10471,7 +10487,8 @@ class CryptoRRRi6 class CryptoRRRi2Tiedop0, bits<2>op1, string asm> : BaseCryptoV82<(outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm), - asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = $Vdst", []> { + asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm" # + "|.4s\t$Vd, $Vn, $Vm$imm}", "$Vd = $Vdst", []> { bits<2> imm; bits<5> Vm; let Inst{24-21} = 0b0010; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 785345422404f..714007f8aba86 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3571,6 +3571,18 @@ static bool 
isCombineInstrCandidate64(unsigned Opc) { // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. case AArch64::SUBXri: case AArch64::SUBSXri: + case AArch64::ADDv8i8: + case AArch64::ADDv16i8: + case AArch64::ADDv4i16: + case AArch64::ADDv8i16: + case AArch64::ADDv2i32: + case AArch64::ADDv4i32: + case AArch64::SUBv8i8: + case AArch64::SUBv16i8: + case AArch64::SUBv4i16: + case AArch64::SUBv8i16: + case AArch64::SUBv2i32: + case AArch64::SUBv4i32: return true; default: break; @@ -3713,6 +3725,13 @@ static bool getMaddPatterns(MachineInstr &Root, } }; + auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { + if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { + Patterns.push_back(Pattern); + Found = true; + } + }; + typedef MachineCombinerPattern MCP; switch (Opc) { @@ -3748,6 +3767,70 @@ static bool getMaddPatterns(MachineInstr &Root, case AArch64::SUBXri: setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); break; + case AArch64::ADDv8i8: + setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); + setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); + break; + case AArch64::ADDv16i8: + setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); + setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); + break; + case AArch64::ADDv4i16: + setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); + setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); + setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); + setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); + break; + case AArch64::ADDv8i16: + setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); + setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); + setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); + setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); + break; + case AArch64::ADDv2i32: + setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); + setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); + setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); + setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); + break; + case AArch64::ADDv4i32: + setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); + setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); + setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); + setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); + break; + case AArch64::SUBv8i8: + setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); + setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); + break; + case AArch64::SUBv16i8: + setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); + setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); + break; + case AArch64::SUBv4i16: + setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); + setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); + setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); + setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); + break; + case AArch64::SUBv8i16: + setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); + setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); + setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); + setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); + break; + case AArch64::SUBv2i32: + setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); + setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); + setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); + 
setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); + break; + case AArch64::SUBv4i32: + setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); + setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); + setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); + setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); + break; } return Found; } @@ -3960,6 +4043,46 @@ bool AArch64InstrInfo::isThroughputPattern( case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::MULADDv8i8_OP1: + case MachineCombinerPattern::MULADDv8i8_OP2: + case MachineCombinerPattern::MULADDv16i8_OP1: + case MachineCombinerPattern::MULADDv16i8_OP2: + case MachineCombinerPattern::MULADDv4i16_OP1: + case MachineCombinerPattern::MULADDv4i16_OP2: + case MachineCombinerPattern::MULADDv8i16_OP1: + case MachineCombinerPattern::MULADDv8i16_OP2: + case MachineCombinerPattern::MULADDv2i32_OP1: + case MachineCombinerPattern::MULADDv2i32_OP2: + case MachineCombinerPattern::MULADDv4i32_OP1: + case MachineCombinerPattern::MULADDv4i32_OP2: + case MachineCombinerPattern::MULSUBv8i8_OP1: + case MachineCombinerPattern::MULSUBv8i8_OP2: + case MachineCombinerPattern::MULSUBv16i8_OP1: + case MachineCombinerPattern::MULSUBv16i8_OP2: + case MachineCombinerPattern::MULSUBv4i16_OP1: + case MachineCombinerPattern::MULSUBv4i16_OP2: + case MachineCombinerPattern::MULSUBv8i16_OP1: + case MachineCombinerPattern::MULSUBv8i16_OP2: + case MachineCombinerPattern::MULSUBv2i32_OP1: + case MachineCombinerPattern::MULSUBv2i32_OP2: + case MachineCombinerPattern::MULSUBv4i32_OP1: + case MachineCombinerPattern::MULSUBv4i32_OP2: + case MachineCombinerPattern::MULADDv4i16_indexed_OP1: + case MachineCombinerPattern::MULADDv4i16_indexed_OP2: + case MachineCombinerPattern::MULADDv8i16_indexed_OP1: + case MachineCombinerPattern::MULADDv8i16_indexed_OP2: + case MachineCombinerPattern::MULADDv2i32_indexed_OP1: + case MachineCombinerPattern::MULADDv2i32_indexed_OP2: + case MachineCombinerPattern::MULADDv4i32_indexed_OP1: + case MachineCombinerPattern::MULADDv4i32_indexed_OP2: + case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: + case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: + case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: + case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: + case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: + case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: + case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: + case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: return true; } // end switch (Pattern) return false; @@ -4063,6 +4186,30 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, return MUL; } +/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate +/// instructions. +/// +/// \see genFusedMultiply +static MachineInstr *genFusedMultiplyAcc( + MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + MachineInstr &Root, SmallVectorImpl &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { + return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, + FMAInstKind::Accumulator); +} + +/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate +/// instructions. 
+/// +/// \see genFusedMultiply +static MachineInstr *genFusedMultiplyIdx( + MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + MachineInstr &Root, SmallVectorImpl &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { + return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, + FMAInstKind::Indexed); +} + /// genMaddR - Generate madd instruction and combine mul and add using /// an extra virtual register /// Example - an ADD intermediate needs to be stored in a register: @@ -4302,6 +4449,211 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + + case MachineCombinerPattern::MULADDv8i8_OP1: + Opc = AArch64::MLAv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i8_OP2: + Opc = AArch64::MLAv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv16i8_OP1: + Opc = AArch64::MLAv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv16i8_OP2: + Opc = AArch64::MLAv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i16_OP1: + Opc = AArch64::MLAv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i16_OP2: + Opc = AArch64::MLAv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_OP1: + Opc = AArch64::MLAv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_OP2: + Opc = AArch64::MLAv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_OP1: + Opc = AArch64::MLAv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_OP2: + Opc = AArch64::MLAv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_OP1: + Opc = AArch64::MLAv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_OP2: + Opc = AArch64::MLAv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::MULSUBv8i8_OP1: + Opc = AArch64::MLSv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i8_OP2: + Opc = AArch64::MLSv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv16i8_OP1: + Opc = AArch64::MLSv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv16i8_OP2: + Opc = AArch64::MLSv16i8; + RC = 
&AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i16_OP1: + Opc = AArch64::MLSv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i16_OP2: + Opc = AArch64::MLSv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i16_OP1: + Opc = AArch64::MLSv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i16_OP2: + Opc = AArch64::MLSv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv2i32_OP1: + Opc = AArch64::MLSv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv2i32_OP2: + Opc = AArch64::MLSv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i32_OP1: + Opc = AArch64::MLSv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i32_OP2: + Opc = AArch64::MLSv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::MULADDv4i16_indexed_OP1: + Opc = AArch64::MLAv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i16_indexed_OP2: + Opc = AArch64::MLAv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_indexed_OP1: + Opc = AArch64::MLAv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_indexed_OP2: + Opc = AArch64::MLAv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_indexed_OP1: + Opc = AArch64::MLAv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_indexed_OP2: + Opc = AArch64::MLAv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_indexed_OP1: + Opc = AArch64::MLAv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_indexed_OP2: + Opc = AArch64::MLAv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: + Opc = AArch64::MLSv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: + Opc = AArch64::MLSv4i16_indexed; + RC 
= &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: + Opc = AArch64::MLSv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: + Opc = AArch64::MLSv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: + Opc = AArch64::MLSv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: + Opc = AArch64::MLSv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: + Opc = AArch64::MLSv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + Opc = AArch64::MLSv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + // Floating Point Support case MachineCombinerPattern::FMULADDH_OP1: Opc = AArch64::FMADDHrrr; @@ -5060,8 +5412,99 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { return 0u; } -outliner::OutlinedFunction -AArch64InstrInfo::getOutliningCandidateInfo( +static bool +outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, + const outliner::Candidate &b) { + const Function &Fa = a.getMF()->getFunction(); + const Function &Fb = b.getMF()->getFunction(); + + // If none of the functions have the "sign-return-address" attribute their + // signing behaviour is equal + if (!Fa.hasFnAttribute("sign-return-address") && + !Fb.hasFnAttribute("sign-return-address")) { + return true; + } + + // If both functions have the "sign-return-address" attribute their signing + // behaviour is equal, if the values of the attributes are equal + if (Fa.hasFnAttribute("sign-return-address") && + Fb.hasFnAttribute("sign-return-address")) { + StringRef ScopeA = + Fa.getFnAttribute("sign-return-address").getValueAsString(); + StringRef ScopeB = + Fb.getFnAttribute("sign-return-address").getValueAsString(); + return ScopeA.equals(ScopeB); + } + + // If function B doesn't have the "sign-return-address" attribute but A does, + // the functions' signing behaviour is equal if A's value for + // "sign-return-address" is "none" and vice versa. 
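+// (Editor's example, not part of the patch: under this rule a candidate from
+// a function carrying "sign-return-address"="none" may be grouped with one
+// from a function that lacks the attribute altogether; neither will ever
+// sign its return address, so the outlined body needs no PAC/AUT handling.)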
+ if (Fa.hasFnAttribute("sign-return-address")) { + StringRef ScopeA = + Fa.getFnAttribute("sign-return-address").getValueAsString(); + return ScopeA.equals("none"); + } + + if (Fb.hasFnAttribute("sign-return-address")) { + StringRef ScopeB = + Fb.getFnAttribute("sign-return-address").getValueAsString(); + return ScopeB.equals("none"); + } + + llvm_unreachable("Unknown combination of sign-return-address attributes"); +} + +static bool +outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, + const outliner::Candidate &b) { + const Function &Fa = a.getMF()->getFunction(); + const Function &Fb = b.getMF()->getFunction(); + + // If none of the functions have the "sign-return-address-key" attribute + // their keys are equal + if (!Fa.hasFnAttribute("sign-return-address-key") && + !Fb.hasFnAttribute("sign-return-address-key")) { + return true; + } + + // If both functions have the "sign-return-address-key" attribute their + // keys are equal if the values of "sign-return-address-key" are equal + if (Fa.hasFnAttribute("sign-return-address-key") && + Fb.hasFnAttribute("sign-return-address-key")) { + StringRef KeyA = + Fa.getFnAttribute("sign-return-address-key").getValueAsString(); + StringRef KeyB = + Fb.getFnAttribute("sign-return-address-key").getValueAsString(); + return KeyA.equals(KeyB); + } + + // If B doesn't have the "sign-return-address-key" attribute, both keys are + // equal if function a has the default key (a_key) + if (Fa.hasFnAttribute("sign-return-address-key")) { + StringRef KeyA = + Fa.getFnAttribute("sign-return-address-key").getValueAsString(); + return KeyA.equals_lower("a_key"); + } + + if (Fb.hasFnAttribute("sign-return-address-key")) { + StringRef KeyB = + Fb.getFnAttribute("sign-return-address-key").getValueAsString(); + return KeyB.equals_lower("a_key"); + } + + llvm_unreachable("Unknown combination of sign-return-address-key attributes"); +} + +static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, + const outliner::Candidate &b) { + const AArch64Subtarget &SubtargetA = + a.getMF()->getSubtarget<AArch64Subtarget>(); + const AArch64Subtarget &SubtargetB = + b.getMF()->getSubtarget<AArch64Subtarget>(); + return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); +} + +outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; unsigned SequenceSize = @@ -5069,12 +5512,99 @@ AArch64InstrInfo::getOutliningCandidateInfo( [this](unsigned Sum, const MachineInstr &MI) { return Sum + getInstSizeInBytes(MI); }); + unsigned NumBytesToCreateFrame = 0; + + // We only allow outlining for functions having exactly matching return + // address signing attributes, i.e., all share the same value for the + // attribute "sign-return-address" and all share the same type of key they + // are signed with. + // Additionally we require all functions to simultaneously either support + // v8.3a features or not. Otherwise an outlined function could get signed + // using dedicated v8.3 instructions and a call from a function that doesn't + // support v8.3 instructions would therefore be invalid. + if (std::adjacent_find( + RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), + [](const outliner::Candidate &a, const outliner::Candidate &b) { + // Return true if a and b are non-equal w.r.t.
return address + // signing or support of v8.3a features + if (outliningCandidatesSigningScopeConsensus(a, b) && + outliningCandidatesSigningKeyConsensus(a, b) && + outliningCandidatesV8_3OpsConsensus(a, b)) { + return false; + } + return true; + }) != RepeatedSequenceLocs.end()) { + return outliner::OutlinedFunction(); + } + + // Since at this point all candidates agree on their return address signing, + // picking just one is fine. If the candidate functions potentially sign their + // return addresses, the outlined function should do the same. Note that in + // the case of "sign-return-address"="non-leaf" this is an assumption: it is + // not necessarily true that the outlined function will have to sign its + // return address, but that decision is made later, when the decision to + // outline has already been made. + // The same holds for the number of additional instructions we need: on + // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is + // necessary. However, at this point we don't know if the outlined function + // will have a RET instruction so we assume the worst. + const Function &FCF = FirstCand.getMF()->getFunction(); + const TargetRegisterInfo &TRI = getRegisterInfo(); + if (FCF.hasFnAttribute("sign-return-address")) { + // One PAC and one AUT instruction + NumBytesToCreateFrame += 8; + + // We have to check if sp modifying instructions would get outlined. + // If so we only allow outlining if sp is unchanged overall, so matching + // sub and add instructions are okay to outline; all other sp modifications + // are not + auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { + int SPValue = 0; + MachineBasicBlock::iterator MBBI = C.front(); + for (;;) { + if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { + switch (MBBI->getOpcode()) { + case AArch64::ADDXri: + case AArch64::ADDWri: + assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); + assert(MBBI->getOperand(2).isImm() && + "Expected operand to be immediate"); + SPValue += MBBI->getOperand(2).getImm(); + break; + case AArch64::SUBXri: + case AArch64::SUBWri: + assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); + assert(MBBI->getOperand(2).isImm() && + "Expected operand to be immediate"); + SPValue -= MBBI->getOperand(2).getImm(); + break; + default: + return true; + } + } + if (MBBI == C.back()) + break; + ++MBBI; + } + if (SPValue) + return true; + return false; + }; + // Remove candidates with illegal stack modifying instructions + RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), + RepeatedSequenceLocs.end(), + hasIllegalSPModification), + RepeatedSequenceLocs.end()); + + // If the sequence doesn't have enough candidates left, then we're done. + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + } // Properties about candidate MBBs that hold for all of them. unsigned FlagsSetInAll = 0xF; // Compute liveness information for each candidate, and set FlagsSetInAll.
- const TargetRegisterInfo &TRI = getRegisterInfo(); std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; @@ -5130,7 +5660,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( }; unsigned FrameID = MachineOutlinerDefault; - unsigned NumBytesToCreateFrame = 4; + NumBytesToCreateFrame += 4; bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); @@ -5399,6 +5929,19 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, MachineFunction *MF = MBB->getParent(); AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + // Don't outline anything used for return address signing. The outlined + // function will get signed later if needed + switch (MI.getOpcode()) { + case AArch64::PACIASP: + case AArch64::PACIBSP: + case AArch64::AUTIASP: + case AArch64::AUTIBSP: + case AArch64::RETAA: + case AArch64::RETAB: + case AArch64::EMITBKEY: + return outliner::InstrType::Illegal; + } + // Don't outline LOHs. if (FuncInfo->getLOHRelated().count(&MI)) return outliner::InstrType::Illegal; @@ -5551,6 +6094,59 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { } } +static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, + bool ShouldSignReturnAddr, + bool ShouldSignReturnAddrWithAKey) { + if (ShouldSignReturnAddr) { + MachineBasicBlock::iterator MBBPAC = MBB.begin(); + MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL; + + if (MBBAUT != MBB.end()) + DL = MBBAUT->getDebugLoc(); + + // At the very beginning of the basic block we insert the following + // depending on the key type + // + // a_key: b_key: + // PACIASP EMITBKEY + // CFI_INSTRUCTION PACIBSP + // CFI_INSTRUCTION + if (ShouldSignReturnAddrWithAKey) { + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP)) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP)) + .setMIFlag(MachineInstr::FrameSetup); + } + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + + // If v8.3a features are available we can replace a RET instruction by + // RETAA or RETAB and omit the AUT instructions + if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() && + MBBAUT->getOpcode() == AArch64::RET) { + BuildMI(MBB, MBBAUT, DL, + TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA + : AArch64::RETAB)) + .copyImplicitOps(*MBBAUT); + MBB.erase(MBBAUT); + } else { + BuildMI(MBB, MBBAUT, DL, + TII->get(ShouldSignReturnAddrWithAKey ?
AArch64::AUTIASP + : AArch64::AUTIBSP)) + .setMIFlag(MachineInstr::FrameDestroy); + } + } +} + void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { @@ -5566,16 +6162,19 @@ void AArch64InstrInfo::buildOutlinedFrame( TailOpcode = AArch64::TCRETURNriALL; } MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) - .add(Call->getOperand(0)) - .addImm(0); + .add(Call->getOperand(0)) + .addImm(0); MBB.insert(MBB.end(), TC); Call->eraseFromParent(); } + bool IsLeafFunction = true; + // Is there a call in the outlined range? - auto IsNonTailCall = [](MachineInstr &MI) { + auto IsNonTailCall = [](const MachineInstr &MI) { return MI.isCall() && !MI.isReturn(); }; + if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) { // Fix up the instructions in the range, since we're going to modify the // stack. @@ -5583,6 +6182,8 @@ void AArch64InstrInfo::buildOutlinedFrame( "Can only fix up stack references once"); fixupPostOutline(MBB); + IsLeafFunction = false; + // LR has to be a live in so that we can save it. MBB.addLiveIn(AArch64::LR); @@ -5629,16 +6230,47 @@ void AArch64InstrInfo::buildOutlinedFrame( Et = MBB.insert(Et, LDRXpost); } + // If a bunch of candidates reach this point they must agree on their return + // address signing. It is therefore enough to just consider the signing + // behaviour of one of them + const Function &CF = OF.Candidates.front().getMF()->getFunction(); + bool ShouldSignReturnAddr = false; + if (CF.hasFnAttribute("sign-return-address")) { + StringRef Scope = + CF.getFnAttribute("sign-return-address").getValueAsString(); + if (Scope.equals("all")) + ShouldSignReturnAddr = true; + else if (Scope.equals("non-leaf") && !IsLeafFunction) + ShouldSignReturnAddr = true; + } + + // a_key is the default + bool ShouldSignReturnAddrWithAKey = true; + if (CF.hasFnAttribute("sign-return-address-key")) { + const StringRef Key = + CF.getFnAttribute("sign-return-address-key").getValueAsString(); + // Key can either be a_key or b_key + assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) && + "Return address signing key must be either a_key or b_key"); + ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key"); + } + // If this is a tail call outlined function, then there's already a return. if (OF.FrameConstructionID == MachineOutlinerTailCall || - OF.FrameConstructionID == MachineOutlinerThunk) + OF.FrameConstructionID == MachineOutlinerThunk) { + signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, + ShouldSignReturnAddrWithAKey); return; + } // It's not a tail call, so we have to insert the return ourselves. MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) .addReg(AArch64::LR, RegState::Undef); MBB.insert(MBB.end(), ret); + signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, + ShouldSignReturnAddrWithAKey); + // Did we have to modify the stack by saving the link register? 
if (OF.FrameConstructionID != MachineOutlinerDefault) return; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 80cf31ff3d56b..48872dc09cdb5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -214,6 +214,7 @@ def SDT_AArch64FCmp : SDTypeProfile<0, 2, SDTCisSameAs<0, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>; def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>; @@ -262,15 +263,17 @@ def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, // non-extending masked load fragment. def nonext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (masked_ld node:$ptr, node:$pred, node:$def), [{ - return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD; + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; // sign extending masked load fragments. def asext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (masked_ld node:$ptr, node:$pred, node:$def),[{ - return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD || - cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD; + (masked_ld node:$ptr, undef, node:$pred, node:$def),[{ + return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD || + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) && + cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; def asext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), @@ -290,8 +293,9 @@ def asext_masked_load_i32 : // zero extending masked load fragments. def zext_masked_load : PatFrag<(ops node:$ptr, node:$pred, node:$def), - (masked_ld node:$ptr, node:$pred, node:$def), [{ - return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD; + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD && + cast<MaskedLoadSDNode>(N)->isUnindexed(); }]>; def zext_masked_load_i8 : PatFrag<(ops node:$ptr, node:$pred, node:$def), @@ -312,14 +316,16 @@ def zext_masked_load_i32 : // non-truncating masked store fragment. def nontrunc_masked_store : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ - return !cast<MaskedStoreSDNode>(N)->isTruncatingStore(); + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed(); }]>; // truncating masked store fragments.
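// (Editor's note on the PatFrag changes in this hunk, added for clarity: the
// extra `undef` operand threaded into masked_ld/masked_st is the offset
// operand that the indexed masked load/store nodes introduced, and the new
// isUnindexed() checks keep pre/post-indexed forms from matching these
// unindexed-only patterns.)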
def trunc_masked_store : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ - return cast<MaskedStoreSDNode>(N)->isTruncatingStore(); + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed(); }]>; def trunc_masked_store_i8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), @@ -396,6 +402,8 @@ def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; +def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>; + def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; @@ -752,6 +760,29 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd, defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla", null_frag>; +let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { + def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>; + def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 1))>; + def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot90 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 0))>; + def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>; +} +let Predicates = [HasComplxNum, HasNEON] in { + def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>; + def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 1))>; + foreach Ty = [v4f32, v2f64] in { + def : Pat<(Ty (int_aarch64_neon_vcadd_rot90 (Ty V128:$Rn), (Ty V128:$Rm))), + (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 0))>; + def : Pat<(Ty (int_aarch64_neon_vcadd_rot270 (Ty V128:$Rn), (Ty V128:$Rm))), + (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 1))>; + } +} + // v8.3a Pointer Authentication // These instructions inhabit part of the hint space and so can be used for // armv8 targets @@ -3793,10 +3824,11 @@ defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; -defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; + +// MLA and MLS are generated in MachineCombine +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; + defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", @@ -5526,10 +5558,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), defm SQDMULH : SIMDIndexedHS<0, 0b1100,
"sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; -defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; + +// Generated by MachineCombine +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; + defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index a4ea2cab13eba..c75208e4aaca6 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -10,6 +10,24 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + let Predicates = [HasSVE] in { def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; @@ -28,7 +46,7 @@ let Predicates = [HasSVE] in { defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>; defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>; defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>; - defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", int_aarch64_sve_bic>; + defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", int_aarch64_sve_bic_base>; defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>; defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>; @@ -37,7 +55,7 @@ let Predicates = [HasSVE] in { defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_or>; defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_xor>; defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; - defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic_pred>; + defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; defm ADD_ZI : sve_int_arith_imm0<0b000, "add">; defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">; @@ -199,14 +217,14 @@ let Predicates = [HasSVE] in { defm SPLICE_ZPZ : sve_int_perm_splice<"splice">; 
defm COMPACT_ZPZ : sve_int_perm_compact<"compact">; - defm INSR_ZR : sve_int_perm_insrs<"insr">; - defm INSR_ZV : sve_int_perm_insrv<"insr">; + defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; + defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; def EXT_ZZI : sve_int_perm_extract_i<"ext">; - defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">; - defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">; - defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">; - defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">; + defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>; + defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>; + defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; + defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>; defm REV_PP : sve_int_perm_reverse_p<"rev">; defm REV_ZZ : sve_int_perm_reverse_z<"rev">; @@ -244,21 +262,21 @@ let Predicates = [HasSVE] in { defm PFIRST : sve_int_pfirst<0b00000, "pfirst">; defm PNEXT : sve_int_pnext<0b00110, "pnext">; - def AND_PPzPP : sve_int_pred_log<0b0000, "and">; - def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">; - def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">; - def SEL_PPPP : sve_int_pred_log<0b0011, "sel">; - def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">; - def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">; - def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">; - def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">; - def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">; - def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">; - def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">; - def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">; - def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">; - def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">; - def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">; + defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and>; + defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic>; + defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor>; + defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>; + defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", int_aarch64_sve_ands>; + defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", int_aarch64_sve_bics>; + defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", int_aarch64_sve_eors>; + defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr>; + defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn>; + defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor>; + defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand>; + defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", int_aarch64_sve_orrs>; + defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", int_aarch64_sve_orns>; + defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", int_aarch64_sve_nors>; + defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", int_aarch64_sve_nands>; defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">; defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">; @@ -402,115 +420,115 @@ let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw] - defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; + defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; // Gathers using scaled 32-bit offsets, e.g. // ld1h z0.s, p0/z, [x0, z0.s, uxtw #1] - defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - - // Gathers using scaled 32-bit pointers with offset, e.g. 
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + + // Gathers using 32-bit pointers with scaled offset, e.g. // ld1h z0.s, p0/z, [z0.s, #16] - defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>; - defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>; - defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>; - defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>; - defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>; - defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>; - defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>; - defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>; - defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>; - defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>; - - // Gathers using scaled 64-bit pointers with offset, e.g. + defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, null_frag, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, null_frag, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>; + + // Gathers using 64-bit pointers with scaled offset, e.g. 
// ld1h z0.d, p0/z, [z0.d, #16] - defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>; - defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>; - defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>; - defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>; - defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>; - defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>; - defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>; - defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>; - defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>; - defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>; - defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>; - defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>; - defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>; - defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>; + defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, null_frag, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, null_frag, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, null_frag, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>; // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", null_frag, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", null_frag, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", null_frag, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
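Across these gather hunks the pattern is the same: each addressing mode gets its own selection node (AArch64ld1_gather, _scaled, _sxtw/_uxtw, and _imm for vector base plus immediate), while null_frag leaves the first-faulting (ldff1*) and most sign-extending forms assembler-only for now. A hedged ACLE-level sketch of the three 64-bit element modes, same assumptions as above and with made-up function names:

    #include <arm_sve.h>

    // ld1d z0.d, p0/z, [x0, z0.d]         -- unscaled 64-bit offsets
    svuint64_t g_offset(svbool_t pg, const uint64_t *base, svuint64_t off) {
      return svld1_gather_u64offset_u64(pg, base, off);
    }

    // ld1d z0.d, p0/z, [x0, z0.d, lsl #3] -- scaled 64-bit indices
    svuint64_t g_index(svbool_t pg, const uint64_t *base, svuint64_t idx) {
      return svld1_gather_u64index_u64(pg, base, idx);
    }

    // ld1d z0.d, p0/z, [z0.d, #16]        -- vector base plus immediate
    svuint64_t g_imm(svbool_t pg, svuint64_t bases) {
      return svld1_gather_u64base_offset_u64(pg, bases, 16);
    }

The scaled 64-bit-offset and 32-bit-unpacked forms continue below.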
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] - defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; + defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; // Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] - defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh",ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw",ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; - defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; @@ -751,15 +769,15 @@ let Predicates = [HasSVE] in { defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">; defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">; - defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">; - defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">; - defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">; - defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">; + defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; + defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; + defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>; + defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>; - defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">; - defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">; - defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">; - defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">; + defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", 
int_aarch64_sve_whilelt>; + defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>; + defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>; + defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>; def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>; def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>; @@ -770,11 +788,11 @@ let Predicates = [HasSVE] in { def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">; def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">; - defm CNTB_XPiI : sve_int_count<0b000, "cntb">; - defm CNTH_XPiI : sve_int_count<0b010, "cnth">; - defm CNTW_XPiI : sve_int_count<0b100, "cntw">; - defm CNTD_XPiI : sve_int_count<0b110, "cntd">; - defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">; + defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>; + defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>; + defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>; + defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>; + defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>; defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">; defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">; @@ -876,53 +894,53 @@ let Predicates = [HasSVE] in { defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">; - - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">; - - defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">; - defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">; - defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">; - - def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>; - def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>; - def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>; - def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>; - def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>; - def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>; - def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>; - def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>; - def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>; - def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, ElementSizeS>; - def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>; - def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>; - def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>; - def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>; - def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>; - 
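With the while* comparisons and the cntb/cnth/cntw/cntd counters now backed by intrinsics, a fully predicated strip-mined loop can select directly to WHILELT and CNTW with no scalar tail. A minimal sketch under the same ACLE assumptions as the earlier examples:

    #include <arm_sve.h>

    // Doubles n ints without a scalar remainder loop: whilelt computes the
    // per-iteration predicate, cntw advances by the hardware vector length.
    void double_all(int32_t *x, int32_t n) {
      for (int32_t i = 0; i < n; i += (int32_t)svcntw()) {
        svbool_t pg = svwhilelt_b32_s32(i, n);      // whilelt p0.s, w?, w?
        svint32_t v = svld1_s32(pg, &x[i]);         // predicated ld1w
        svst1_s32(pg, &x[i], svmul_n_s32_x(pg, v, 2));
      }
    }

The fcvt/scvtf conversion rewrites continue below.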
def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>; - def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>; - def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>; - def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>; - def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>; - def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>; - def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>; - def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>; - def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>; - def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>; - def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>; + + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>; + + defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; + defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; + defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; + + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, 
ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : 
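Each sve_fp_2op_p_zd conversion above now names its intrinsic plus the exact result, predicate, and source vector types; note that the element-size-changing forms deliberately take the widest predicate type, nxv16i1. At the ACLE level, hedged as before with illustrative names:

    #include <arm_sve.h>

    // fcvt z0.s, p0/m, z0.h   (int_aarch64_sve_fcvt_f32f16)
    svfloat32_t widen(svbool_t pg, svfloat16_t v) {
      return svcvt_f32_f16_x(pg, v);
    }

    // fcvtzs z0.s, p0/m, z0.s (int_aarch64_sve_fcvtzs)
    svint32_t to_int(svbool_t pg, svfloat32_t v) {
      return svcvt_s32_f32_x(pg, v);
    }

The remaining fcvtzs/fcvtzu entries continue directly below.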
sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>; defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>; @@ -1419,32 +1437,32 @@ let Predicates = [HasSVE2] in { defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; // SVE2 floating-point base 2 logarithm as integer - defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; // SVE2 floating-point convert precision - defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">; - defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">; - defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">; - def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; + defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">; + defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">; + defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">; + defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">; // SVE2 floating-point pairwise operations - defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">; - defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">; - defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">; - defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">; - defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">; + defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>; + defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>; + defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>; + defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>; + defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>; // SVE2 floating-point multiply-add long (indexed) - def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">; - def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">; - def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">; - def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">; + defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>; + defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>; + defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>; + defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>; // SVE2 floating-point multiply-add long - def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">; - def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">; - def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">; - def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">; + defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>; + defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>; + defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>; + defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>; // SVE2 bitwise ternary operations defm 
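The widening multiply-add instructions become defms as well, with the indexed variants routed through the new SVE_4_Op_Imm_Pat so that the lane number is matched as an immediate. A sketch (these are SVE2 builtins, so -march=armv8-a+sve2; function names illustrative):

    #include <arm_sve.h>

    // fmlalb z0.s, z1.h, z2.h -- widening multiply-add of the even f16 lanes
    svfloat32_t mla_bottom(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
      return svmlalb_f32(acc, a, b);
    }

    // fmlalt z0.s, z1.h, z2.h[3] -- indexed form; the lane stays an immediate
    svfloat32_t mla_top_lane(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
      return svmlalt_lane_f32(acc, a, b, 3);
    }

The bitwise-ternary definitions continue below.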
EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">; @@ -1493,15 +1511,16 @@ let Predicates = [HasSVE2] in { defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; // SVE2 integer compare scalar count and limit - defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">; - defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">; - defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">; - defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">; - - defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">; - defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">; - defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">; - defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">; + defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", null_frag>; + defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", null_frag>; + defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", null_frag>; + defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", null_frag>; + + defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", null_frag>; + defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", null_frag>; + defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", null_frag>; + defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", null_frag>; + // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 908d72dbfc3eb..ddbddb9607d75 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -156,6 +156,12 @@ int AArch64TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, if (BitSize == 0) return TTI::TCC_Free; + // Most (all?) AArch64 intrinsics do not support folding immediates into the + // selected instruction, so we compute the materialization cost for the + // immediate directly. + if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv) + return AArch64TTIImpl::getIntImmCost(Imm, Ty); + switch (IID) { default: return TTI::TCC_Free; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 855510e7f5568..96a0117c9551a 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -304,8 +304,29 @@ class SVE_4_Op_Pat; +class SVE_3_Op_Imm_Pat +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, (vt3 ImmTy:$Op3))), + (inst $Op1, $Op2, ImmTy:$Op3)>; + +class SVE_4_Op_Imm_Pat +: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))), + (inst $Op1, $Op2, $Op3, ImmTy:$Op4)>; + def SVEDup0Undef : ComplexPattern; +// +// Common but less generic patterns. 
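The SVE_3_Op_Imm_Pat and SVE_4_Op_Imm_Pat classes introduced just above match intrinsic calls whose trailing operand is a compile-time constant and keep it as an immediate on the selected instruction; ASRD is a typical user. A small sketch, same ACLE assumptions:

    #include <arm_sve.h>

    // asrd z0.s, p0/m, z0.s, #2 -- rounding shift, i.e. signed divide by 4;
    // SVE_3_Op_Imm_Pat keeps the #2 as an immediate rather than a register.
    svint32_t div4(svbool_t pg, svint32_t v) {
      return svasrd_n_s32_x(pg, v, 2);
    }

The remaining pattern-class additions continue below.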
+// + +class SVE_1_Op_AllActive_Pat +: Pat<(vtd (op vt1:$Op1)), + (inst (IMPLICIT_DEF), (ptrue 31), $Op1)>; + //===----------------------------------------------------------------------===// // SVE Predicate Misc Group //===----------------------------------------------------------------------===// @@ -483,11 +504,17 @@ class sve_int_pcount_pred sz8_64, bits<4> opc, string asm, let Inst{4-0} = Rd; } -multiclass sve_int_pcount_pred opc, string asm> { +multiclass sve_int_pcount_pred opc, string asm, + SDPatternOperator int_op> { def _B : sve_int_pcount_pred<0b00, opc, asm, PPR8>; def _H : sve_int_pcount_pred<0b01, opc, asm, PPR16>; def _S : sve_int_pcount_pred<0b10, opc, asm, PPR32>; def _D : sve_int_pcount_pred<0b11, opc, asm, PPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -512,13 +539,16 @@ class sve_int_count opc, string asm> let Inst{4-0} = Rd; } -multiclass sve_int_count opc, string asm> { +multiclass sve_int_count opc, string asm, SDPatternOperator op> { def NAME : sve_int_count; def : InstAlias(NAME) GPR64:$Rd, sve_pred_enum:$pattern, 1), 1>; def : InstAlias(NAME) GPR64:$Rd, 0b11111, 1), 2>; + + def : Pat<(i64 (op sve_pred_enum:$pattern)), + (!cast(NAME) sve_pred_enum:$pattern, 1)>; } class sve_int_countvlv opc, string asm, ZPRRegOp zprty> @@ -888,14 +918,18 @@ class sve_int_perm_insrs sz8_64, string asm, ZPRRegOp zprty, let Constraints = "$Zdn = $_Zdn"; let DestructiveInstType = Destructive; - let ElementSize = ElementSizeNone; } -multiclass sve_int_perm_insrs { +multiclass sve_int_perm_insrs { def _B : sve_int_perm_insrs<0b00, asm, ZPR8, GPR32>; def _H : sve_int_perm_insrs<0b01, asm, ZPR16, GPR32>; def _S : sve_int_perm_insrs<0b10, asm, ZPR32, GPR32>; def _D : sve_int_perm_insrs<0b11, asm, ZPR64, GPR64>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve_int_perm_insrv sz8_64, string asm, ZPRRegOp zprty, @@ -914,14 +948,17 @@ class sve_int_perm_insrv sz8_64, string asm, ZPRRegOp zprty, let Constraints = "$Zdn = $_Zdn"; let DestructiveInstType = Destructive; - let ElementSize = ElementSizeNone; } -multiclass sve_int_perm_insrv { +multiclass sve_int_perm_insrv { def _B : sve_int_perm_insrv<0b00, asm, ZPR8, FPR8>; def _H : sve_int_perm_insrv<0b01, asm, ZPR16, FPR16>; def _S : sve_int_perm_insrv<0b10, asm, ZPR32, FPR32>; def _D : sve_int_perm_insrv<0b11, asm, ZPR64, FPR64>; + + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1032,6 +1069,16 @@ class sve_int_pred_log opc, string asm> !strconcat(asm, "\t$Pd, $Pg/z, $Pn, $Pm")); let Defs = !if(!eq (opc{2}, 1), [NZCV], []); + +} + +multiclass sve_int_pred_log opc, string asm, SDPatternOperator op> { + def NAME : sve_int_pred_log; + + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; } @@ -1633,18 +1680,26 @@ class sve2_fp_convert_precision opc, string asm, let Constraints = "$Zd = $_Zd"; } -multiclass sve2_fp_convert_down_narrow { +multiclass sve2_fp_convert_down_narrow { def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>; def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(op # _f16f32), nxv8f16, nxv16i1, 
nxv4f32, !cast(NAME # _StoH)>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast(NAME # _DtoS)>; } -multiclass sve2_fp_convert_up_long { +multiclass sve2_fp_convert_up_long { def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>; def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>; + + def : SVE_3_Op_Pat(op # _f32f16), nxv4f32, nxv16i1, nxv8f16, !cast(NAME # _HtoS)>; + def : SVE_3_Op_Pat(op # _f64f32), nxv2f64, nxv16i1, nxv4f32, !cast(NAME # _StoD)>; } -multiclass sve2_fp_convert_down_odd_rounding { +multiclass sve2_fp_convert_down_odd_rounding_top { def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast(NAME # _DtoS)>; } //===----------------------------------------------------------------------===// @@ -1674,10 +1729,14 @@ class sve2_fp_pairwise_pred sz, bits<3> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve2_fp_pairwise_pred opc, string asm> { +multiclass sve2_fp_pairwise_pred opc, string asm, SDPatternOperator op> { def _H : sve2_fp_pairwise_pred<0b01, opc, asm, ZPR16>; def _S : sve2_fp_pairwise_pred<0b10, opc, asm, ZPR32>; def _D : sve2_fp_pairwise_pred<0b11, opc, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } //===----------------------------------------------------------------------===// @@ -1686,7 +1745,7 @@ multiclass sve2_fp_pairwise_pred opc, string asm> { class sve2_fp_mla_long_by_indexed_elem opc, string asm> : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, - VectorIndexH:$iop), + VectorIndexH32b:$iop), asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> { @@ -1710,6 +1769,12 @@ class sve2_fp_mla_long_by_indexed_elem opc, string asm> let ElementSize = ElementSizeNone; } +multiclass sve2_fp_mla_long_by_indexed_elem opc, string asm, + SDPatternOperator op> { + def NAME : sve2_fp_mla_long_by_indexed_elem; + def : SVE_4_Op_Imm_Pat(NAME)>; +} + //===----------------------------------------------------------------------===// // SVE2 Floating Point Widening Multiply-Add Group //===----------------------------------------------------------------------===// @@ -1736,6 +1801,11 @@ class sve2_fp_mla_long opc, string asm> let ElementSize = ElementSizeNone; } +multiclass sve2_fp_mla_long opc, string asm, SDPatternOperator op> { + def NAME : sve2_fp_mla_long; + def : SVE_3_Op_Pat(NAME)>; +} + //===----------------------------------------------------------------------===// // SVE Stack Allocation Group //===----------------------------------------------------------------------===// @@ -1830,6 +1900,16 @@ class sve_fp_2op_p_zd opc, string asm, RegisterOperand i_zprtype, let ElementSize = size; } +multiclass sve_fp_2op_p_zd opc, string asm, + RegisterOperand i_zprtype, + RegisterOperand o_zprtype, + SDPatternOperator op, ValueType vt1, + ValueType vt2, ValueType vt3, ElementSizeEnum Sz> { + def NAME : sve_fp_2op_p_zd; + + def : SVE_3_Op_Pat(NAME)>; +} + multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { def _H : sve_fp_2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16, ElementSizeH>; def _S : sve_fp_2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32, ElementSizeS>; @@ -1840,10 +1920,19 @@ multiclass sve_fp_2op_p_zd_HSD opc, string asm, SDPatternOperator op> { def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve2_fp_flogb { +multiclass sve2_fp_flogb { def _H : sve_fp_2op_p_zd<0b0011010, asm, ZPR16, ZPR16, ElementSizeH>; def _S : sve_fp_2op_p_zd<0b0011100, asm, 
ZPR32, ZPR32, ElementSizeS>; def _D : sve_fp_2op_p_zd<0b0011110, asm, ZPR64, ZPR64, ElementSizeD>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; +} + +multiclass sve2_fp_convert_down_odd_rounding { + def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>; + def : SVE_3_Op_Pat(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast(NAME # _DtoS)>; } //===----------------------------------------------------------------------===// @@ -3560,7 +3649,8 @@ class sve_int_cterm } class sve_int_while_rr sz8_64, bits<4> opc, string asm, - RegisterClass gprty, PPRRegOp pprty> + RegisterClass gprty, PPRRegOp pprty, + ValueType vt, SDPatternOperator op> : I<(outs pprty:$Pd), (ins gprty:$Rn, gprty:$Rm), asm, "\t$Pd, $Rn, $Rm", "", []>, Sched<[]> { @@ -3580,18 +3670,28 @@ class sve_int_while_rr sz8_64, bits<4> opc, string asm, let Defs = [NZCV]; } -multiclass sve_int_while4_rr opc, string asm> { - def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8>; - def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16>; - def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32>; - def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64>; +multiclass sve_int_while4_rr opc, string asm, SDPatternOperator op> { + def _B : sve_int_while_rr<0b00, { 0, opc }, asm, GPR32, PPR8, nxv16i1, op>; + def _H : sve_int_while_rr<0b01, { 0, opc }, asm, GPR32, PPR16, nxv8i1, op>; + def _S : sve_int_while_rr<0b10, { 0, opc }, asm, GPR32, PPR32, nxv4i1, op>; + def _D : sve_int_while_rr<0b11, { 0, opc }, asm, GPR32, PPR64, nxv2i1, op>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } -multiclass sve_int_while8_rr opc, string asm> { - def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8>; - def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16>; - def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32>; - def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64>; +multiclass sve_int_while8_rr opc, string asm, SDPatternOperator op> { + def _B : sve_int_while_rr<0b00, { 1, opc }, asm, GPR64, PPR8, nxv16i1, op>; + def _H : sve_int_while_rr<0b01, { 1, opc }, asm, GPR64, PPR16, nxv8i1, op>; + def _S : sve_int_while_rr<0b10, { 1, opc }, asm, GPR64, PPR32, nxv4i1, op>; + def _D : sve_int_while_rr<0b11, { 1, opc }, asm, GPR64, PPR64, nxv2i1, op>; + + def : SVE_2_Op_Pat(NAME # _B)>; + def : SVE_2_Op_Pat(NAME # _H)>; + def : SVE_2_Op_Pat(NAME # _S)>; + def : SVE_2_Op_Pat(NAME # _D)>; } class sve2_int_while_rr sz8_64, bits<1> rw, string asm, @@ -3906,7 +4006,8 @@ multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { } } -multiclass sve_int_bin_pred_shift_imm_right opc, string asm> { +multiclass sve_int_bin_pred_shift_imm_right opc, string asm, + SDPatternOperator op = null_frag> { def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, ElementSizeB>; def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, @@ -3922,6 +4023,11 @@ multiclass sve_int_bin_pred_shift_imm_right opc, string asm> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } + + def : SVE_3_Op_Imm_Pat(NAME # _B)>; + def : SVE_3_Op_Imm_Pat(NAME # _H)>; + def : SVE_3_Op_Imm_Pat(NAME # _S)>; + def : SVE_3_Op_Imm_Pat(NAME # _D)>; } class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, @@ -3948,17 +4054,28 @@ class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, let ElementSize = zprty.ElementSize; } -multiclass 
sve_int_bin_pred_shift opc, string asm> { +multiclass sve_int_bin_pred_shift opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>; def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>; def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>; def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_wide opc, string asm> { +multiclass sve_int_bin_pred_shift_wide opc, string asm, + SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>; def _H : sve_int_bin_pred_shift<0b01, 0b1, opc, asm, ZPR16, ZPR64>; def _S : sve_int_bin_pred_shift<0b10, 0b1, opc, asm, ZPR32, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; } //===----------------------------------------------------------------------===// @@ -4759,26 +4876,46 @@ class sve_int_perm_rev sz8_64, bits<2> opc, string asm, let ElementSize = zprty.ElementSize; } -multiclass sve_int_perm_rev_rbit { +multiclass sve_int_perm_rev_rbit { def _B : sve_int_perm_rev<0b00, 0b11, asm, ZPR8>; def _H : sve_int_perm_rev<0b01, 0b11, asm, ZPR16>; def _S : sve_int_perm_rev<0b10, 0b11, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b11, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _B)>; + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_perm_rev_revb { +multiclass sve_int_perm_rev_revb { def _H : sve_int_perm_rev<0b01, 0b00, asm, ZPR16>; def _S : sve_int_perm_rev<0b10, 0b00, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b00, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _H)>; + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; + + def : SVE_1_Op_AllActive_Pat(NAME # _H), PTRUE_H>; + def : SVE_1_Op_AllActive_Pat(NAME # _S), PTRUE_S>; + def : SVE_1_Op_AllActive_Pat(NAME # _D), PTRUE_D>; } -multiclass sve_int_perm_rev_revh { +multiclass sve_int_perm_rev_revh { def _S : sve_int_perm_rev<0b10, 0b01, asm, ZPR32>; def _D : sve_int_perm_rev<0b11, 0b01, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _S)>; + def : SVE_3_Op_Pat(NAME # _D)>; } -multiclass sve_int_perm_rev_revw { +multiclass sve_int_perm_rev_revw { def _D : sve_int_perm_rev<0b11, 0b10, asm, ZPR64>; + + def : SVE_3_Op_Pat(NAME # _D)>; } class sve_int_perm_cpy_r sz8_64, string asm, ZPRRegOp zprty, @@ -5215,8 +5352,11 @@ class sve_mem_32b_gld_sv opc, bit xs, bit scaled, string asm, } multiclass sve_mem_32b_gld_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_SCALED_REAL : sve_mem_32b_gld_sv; def _SXTW_SCALED_REAL : sve_mem_32b_gld_sv; @@ -5224,11 +5364,19 @@ multiclass sve_mem_32b_gld_sv_32_scaled opc, string asm, (!cast(NAME # _UXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), + (!cast(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), + (!cast(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, 
ZPR:$indices)>; } multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_REAL : sve_mem_32b_gld_sv; def _SXTW_REAL : sve_mem_32b_gld_sv; @@ -5236,6 +5384,11 @@ multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, (!cast(NAME # _UXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), + (!cast(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), + (!cast(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } @@ -5263,7 +5416,8 @@ class sve_mem_32b_gld_vi opc, string asm, Operand imm_ty> let Uses = !if(!eq(opc{0}, 1), [FFR], []); } -multiclass sve_mem_32b_gld_vi_32_ptrs opc, string asm, Operand imm_ty> { +multiclass sve_mem_32b_gld_vi_32_ptrs opc, string asm, Operand imm_ty, + SDPatternOperator op, ValueType vt> { def _IMM_REAL : sve_mem_32b_gld_vi; def : InstAlias opc, string asm, Operand imm_ty> { (!cast(NAME # _IMM_REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)), + (!cast(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_prfm_si msz, string asm> @@ -5507,8 +5664,11 @@ class sve_mem_64b_gld_sv opc, bit xs, bit scaled, bit lsl, string asm, } multiclass sve_mem_64b_gld_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_SCALED_REAL : sve_mem_64b_gld_sv; def _SXTW_SCALED_REAL : sve_mem_64b_gld_sv; @@ -5516,11 +5676,19 @@ multiclass sve_mem_64b_gld_sv_32_scaled opc, string asm, (!cast(NAME # _UXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + ValueType vt> { def _UXTW_REAL : sve_mem_64b_gld_sv; def _SXTW_REAL : sve_mem_64b_gld_sv; @@ -5528,21 +5696,34 @@ multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, (!cast(NAME # _UXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } multiclass 
sve_mem_64b_gld_sv2_64_scaled opc, string asm, - RegisterOperand zprext> { + SDPatternOperator op, + RegisterOperand zprext, ValueType vt> { def _SCALED_REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), + (!cast(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm> { +multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, + SDPatternOperator op, ValueType vt> { def _REAL : sve_mem_64b_gld_sv; def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), + (!cast(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> @@ -5569,7 +5750,8 @@ class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> let Uses = !if(!eq(opc{0}, 1), [FFR], []); } -multiclass sve_mem_64b_gld_vi_64_ptrs opc, string asm, Operand imm_ty> { +multiclass sve_mem_64b_gld_vi_64_ptrs opc, string asm, Operand imm_ty, + SDPatternOperator op, ValueType vt> { def _IMM_REAL : sve_mem_64b_gld_vi; def : InstAlias opc, string asm, Operand imm_ty> { (!cast(NAME # _IMM_REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)), + (!cast(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } // bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl) diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 7a4fcac09ec4d..57c126fe6494b 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -643,6 +643,17 @@ namespace AArch64II { }; } // end namespace AArch64II +namespace AArch64 { +// The number of bits in a SVE register is architecturally defined +// to be a multiple of this value. If <M x t> has this number of bits, +// a <n x M x t> vector can be stored in a SVE register without any +// redundant bits. If <M x t> has this number of bits divided by P, +// a <n x M x t> vector is stored in a SVE register by placing index i +// in index i*P of a <n x (M*P) x t> vector. The other elements of the +// <n x (M*P) x t> vector (such as index 1) are undefined.
+static constexpr unsigned SVEBitsPerBlock = 128; +} // end namespace AArch64 + } // end namespace llvm #endif diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def index 85d1ad3491573..ae87cf08275f0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def @@ -32,7 +32,13 @@ enum PartialMappingIdx { PM_VGPR512 = 22, PM_VGPR1024 = 23, PM_SGPR96 = 24, - PM_VGPR96 = 25 + PM_VGPR96 = 25, + PM_AGPR96 = 26, + PM_AGPR32 = 32, + PM_AGPR64 = 33, + PM_AGPR128 = 34, + PM_AGPR512 = 36, + PM_AGPR1024 = 37 }; const RegisterBankInfo::PartialMapping PartMappings[] { @@ -58,7 +64,14 @@ const RegisterBankInfo::PartialMapping PartMappings[] { {0, 512, VGPRRegBank}, {0, 1024, VGPRRegBank}, {0, 96, SGPRRegBank}, - {0, 96, VGPRRegBank} + {0, 96, VGPRRegBank}, + {0, 96, AGPRRegBank}, + + {0, 32, AGPRRegBank}, // AGPR begin + {0, 64, AGPRRegBank}, + {0, 128, AGPRRegBank}, + {0, 512, AGPRRegBank}, + {0, 1024, AGPRRegBank} }; const RegisterBankInfo::ValueMapping ValMappings[] { @@ -94,7 +107,21 @@ const RegisterBankInfo::ValueMapping ValMappings[] { {&PartMappings[16], 1}, // 512 {&PartMappings[17], 1}, // 1024 {&PartMappings[18], 1}, - {&PartMappings[19], 1} + {&PartMappings[19], 1}, + {&PartMappings[20], 1}, + + // AGPRs + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, + {nullptr, 0}, + {&PartMappings[21], 1}, // 32 + {&PartMappings[22], 1}, // 64 + {&PartMappings[23], 1}, // 128 + {nullptr, 0}, + {&PartMappings[24], 1}, // 512 + {&PartMappings[25], 1} // 1024 }; const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] { @@ -122,7 +149,8 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] { enum ValueMappingIdx { SCCStartIdx = 0, SGPRStartIdx = 2, - VGPRStartIdx = 13 + VGPRStartIdx = 13, + AGPRStartIdx = 27 }; const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, @@ -139,12 +167,32 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID, Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR1 : PM_VGPR1; break; case 96: - assert(BankID != AMDGPU::VCCRegBankID); - Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR96 : PM_VGPR96; + switch (BankID) { + case AMDGPU::VGPRRegBankID: + Idx = PM_VGPR96; + break; + case AMDGPU::SGPRRegBankID: + Idx = PM_SGPR96; + break; + case AMDGPU::AGPRRegBankID: + Idx = PM_AGPR96; + break; + default: llvm_unreachable("Invalid register bank"); + } break; default: - assert(BankID != AMDGPU::VCCRegBankID); - Idx = BankID == AMDGPU::VGPRRegBankID ? 
VGPRStartIdx : SGPRStartIdx; + switch (BankID) { + case AMDGPU::VGPRRegBankID: + Idx = VGPRStartIdx; + break; + case AMDGPU::SGPRRegBankID: + Idx = SGPRStartIdx; + break; + case AMDGPU::AGPRRegBankID: + Idx = AGPRStartIdx; + break; + default: llvm_unreachable("Invalid register bank"); + } Idx += Log2_32_Ceil(Size); break; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 8dae8b6c932ef..a51d3d74c899f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -106,6 +106,14 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST) (void)RBVGPR; assert(&RBVGPR == &AMDGPU::VGPRRegBank); + const RegisterBank &RBAGPR = getRegBank(AMDGPU::AGPRRegBankID); + (void)RBAGPR; + assert(&RBAGPR == &AMDGPU::AGPRRegBank); +} + +static bool isVectorRegisterBank(const RegisterBank &Bank) { + unsigned BankID = Bank.getID(); + return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID; } unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, @@ -113,7 +121,7 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, unsigned Size) const { // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane? if (Dst.getID() == AMDGPU::SGPRRegBankID && - Src.getID() == AMDGPU::VGPRRegBankID) { + isVectorRegisterBank(Src)) { return std::numeric_limits::max(); } @@ -127,8 +135,8 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, if (Size == 1 && (Dst.getID() == AMDGPU::SCCRegBankID || Dst.getID() == AMDGPU::SGPRRegBankID) && - (Src.getID() == AMDGPU::SGPRRegBankID || - Src.getID() == AMDGPU::VGPRRegBankID || + (isVectorRegisterBank(Src) || + Src.getID() == AMDGPU::SGPRRegBankID || Src.getID() == AMDGPU::VCCRegBankID)) return std::numeric_limits::max(); @@ -136,6 +144,11 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst, Src.getID() == AMDGPU::VCCRegBankID) return std::numeric_limits::max(); + // There is no direct copy between AGPRs. + if (Dst.getID() == AMDGPU::AGPRRegBankID && + Src.getID() == AMDGPU::AGPRRegBankID) + return 4; + return RegisterBankInfo::copyCost(Dst, Src, Size); } @@ -169,7 +182,12 @@ const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( if (&RC == &AMDGPU::SReg_1RegClass) return AMDGPU::VCCRegBank; - return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank; + if (TRI->isSGPRClass(&RC)) + return AMDGPU::SGPRRegBank; + if (TRI->isAGPRClass(&RC)) + return AMDGPU::AGPRRegBank; + + return AMDGPU::VGPRRegBank; } template @@ -1908,7 +1926,7 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const { continue; Register Reg = MI.getOperand(i).getReg(); if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) { - if (Bank->getID() == AMDGPU::VGPRRegBankID) + if (isVectorRegisterBank(*Bank)) return false; assert(Bank->getID() == AMDGPU::SGPRRegBankID || @@ -2072,7 +2090,6 @@ AMDGPURegisterBankInfo::getRegBankID(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, unsigned Default) const { - const RegisterBank *Bank = getRegBank(Reg, MRI, TRI); return Bank ? 
Bank->getID() : Default;
 }
 
@@ -2102,6 +2119,14 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
   return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
 }
 
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
+                                         const MachineRegisterInfo &MRI,
+                                         const TargetRegisterInfo &TRI) const {
+  unsigned Size = getSizeInBits(Reg, MRI, TRI);
+  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
+}
+
 ///
 /// This function must return a legal mapping, because
 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
@@ -2725,6 +2750,38 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
     }
+    case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
+    case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
+    case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
+    case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
+    case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
+    case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
+    case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
+    case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
+    case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
+    case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
+    case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
+    case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
+    case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
+    case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
+    case Intrinsic::amdgcn_mfma_f32_32x32x4bf16: {
+      // Default for MAI intrinsics.
+      // srcC can also be an immediate which can be folded later.
+      // FIXME: Should we eventually add an alternative mapping with AGPR src
+      // for srcA/srcB?
+      //
+      // vdst, srcA, srcB, srcC
+      OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+      OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+      OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+      OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index a14b74961118a..9549e444ade54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -103,6 +103,11 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
                                       const MachineRegisterInfo &MRI,
                                       const TargetRegisterInfo &TRI) const;
 
+  // Return a value mapping for an operand that is required to be an AGPR.
+  const ValueMapping *getAGPROpMapping(Register Reg,
+                                       const MachineRegisterInfo &MRI,
+                                       const TargetRegisterInfo &TRI) const;
+
  /// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p
  /// Regs. This appropriately sets the regbank of the new registers.
void split64BitValueForMapping(MachineIRBuilder &B, @@ -131,6 +136,7 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const MachineInstr &MI, const MachineRegisterInfo &MRI) const; bool isSALUMapping(const MachineInstr &MI) const; + const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const; const InstructionMapping &getDefaultMappingAllVGPR( diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td index 00f53b1575770..ab3b176ac2147 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td @@ -18,3 +18,7 @@ def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>; // It is helpful to distinguish conditions from ordinary SGPRs. def VCCRegBank : RegisterBank <"VCC", [SReg_1]>; + +def AGPRRegBank : RegisterBank <"AGPR", + [AGPR_32, AReg_64, AReg_128, AReg_512, AReg_1024] +>; diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 4ae981581027f..9388592c88734 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -304,15 +304,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address); if (Res) { IsSDWA = true; break; } - // Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and - // v_mad_mixhi_f16 for FMA variants. Try to decode using this special - // table first so we print the correct name. - - if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) { - Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address); - if (Res) break; - } - if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) { Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address); if (Res) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index ca17ba8b7229c..f2c00ddce94c3 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -429,6 +429,29 @@ static bool tryAddToFoldList(SmallVectorImpl &FoldList, return true; } + // Check the case where we might introduce a second constant operand to a + // scalar instruction + if (TII->isSALU(MI->getOpcode())) { + const MCInstrDesc &InstDesc = MI->getDesc(); + const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo]; + const SIRegisterInfo &SRI = TII->getRegisterInfo(); + + // Fine if the operand can be encoded as an inline constant + if (OpToFold->isImm()) { + if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) || + !TII->isInlineConstant(*OpToFold, OpInfo)) { + // Otherwise check for another constant + for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) { + auto &Op = MI->getOperand(i); + if (OpNo != i && + TII->isLiteralConstantLike(Op, OpInfo)) { + return false; + } + } + } + } + } + appendFoldCandidate(FoldList, MI, OpNo, OpToFold); return true; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ed915f03be217..5e39e7c119bc4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6211,7 +6211,11 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, if (ST.hasAddNoCarry()) return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg); - Register UnusedCarry = 
RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + // If available, prefer to use vcc. + Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC) + ? Register(RI.getVCC()) + : RS.scavengeRegister(RI.getBoolRC(), I, 0, false); + // TODO: Users need to deal with this. if (!UnusedCarry.isValid()) return MachineInstrBuilder(); diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index 6f9abd3a8d9b9..bf052dc3c9304 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -372,12 +372,15 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) { // exit" mask. MachineInstr *And = nullptr, *Or = nullptr; if (!SkipAnding) { - And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Dst) + Register AndReg = MRI->createVirtualRegister(BoolRC); + And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg) .addReg(Exec) .add(MI.getOperand(1)); Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) - .addReg(Dst) + .addReg(AndReg) .add(MI.getOperand(2)); + if (LIS) + LIS->createAndComputeVirtRegInterval(AndReg); } else Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst) .add(MI.getOperand(1)) diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp index 9b3b2436475ce..05c81feb23ecd 100644 --- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp +++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp @@ -26,6 +26,7 @@ #include "SIRegisterInfo.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" @@ -73,8 +74,8 @@ class SIPeepholeSDWA : public MachineFunctionPass { const SIRegisterInfo *TRI; const SIInstrInfo *TII; - std::unordered_map> SDWAOperands; - std::unordered_map PotentialMatches; + MapVector> SDWAOperands; + MapVector PotentialMatches; SmallVector ConvertedInstructions; Optional foldToImm(const MachineOperand &Op) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index efcc7266316e0..5796c6e6a112c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1123,11 +1123,15 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, if (!IsVOP2) MIB.addImm(0); // clamp bit } else { - Register ConstOffsetReg = - RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false); + assert(MIB->getOpcode() == AMDGPU::V_ADD_I32_e64 && + "Need to reuse carry out register"); - // This should always be able to use the unused carry out. - assert(ConstOffsetReg && "this scavenge should not be able to fail"); + // Use scavenged unused carry out as offset register. + Register ConstOffsetReg; + if (!isWave32) + ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0); + else + ConstOffsetReg = MIB.getReg(1); BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg) .addImm(Offset); @@ -1136,10 +1140,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, MIB.addImm(0); // clamp bit } } else { - // We have to produce a carry out, and we there isn't a free SGPR - // pair for it. We can keep the whole computation on the SALU to - // avoid clobbering an additional register at the cost of an extra - // mov. + // We have to produce a carry out, and there isn't a free SGPR pair + // for it. We can keep the whole computation on the SALU to avoid + // clobbering an additional register at the cost of an extra mov. 
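// (Illustrative sketch only, not the exact emitted order: the all-SALU
// expansion below materializes the scaled frame offset with
// S_MOV_B32/S_SUB_U32/S_LSHL_B32, instead of a V_ADD_I32_e64 whose carry
// out would require a free SGPR pair.)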
// We may have 1 free scratch SGPR even though a carry out is // unavailable. Only one additional mov is needed. @@ -1161,9 +1164,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) .addImm(Offset); - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) - .addImm(ST.getWavefrontSizeLog2()); + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) + .addReg(DiffReg, RegState::Kill) + .addImm(ST.getWavefrontSizeLog2()); } } } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index ac3dea1a1a281..ac8c56fa3a038 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -144,6 +144,11 @@ class SIRegisterInfo final : public AMDGPURegisterInfo { return isSGPRClass(RC); } + /// \returns true if this class contains only AGPR registers + bool isAGPRClass(const TargetRegisterClass *RC) const { + return hasAGPRs(RC) && !hasVGPRs(RC); + } + /// \returns true if this class contains VGPR registers. bool hasVGPRs(const TargetRegisterClass *RC) const; diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h index 2e6f756d522c8..9076c191d8397 100644 --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -43,7 +43,6 @@ FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM, FunctionPass *createA15SDOptimizerPass(); FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false); FunctionPass *createARMExpandPseudoPass(); -FunctionPass *createARMCodeGenPreparePass(); FunctionPass *createARMConstantIslandPass(); FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); @@ -61,7 +60,6 @@ void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, void initializeARMParallelDSPPass(PassRegistry &); void initializeARMLoadStoreOptPass(PassRegistry &); void initializeARMPreAllocLoadStoreOptPass(PassRegistry &); -void initializeARMCodeGenPreparePass(PassRegistry &); void initializeARMConstantIslandsPass(PassRegistry &); void initializeARMExpandPseudoPass(PassRegistry &); void initializeThumb2SizeReducePass(PassRegistry &); diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 285dad1cf29a3..66bfd4c82e25c 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -391,11 +391,9 @@ def FeatureExecuteOnly : SubtargetFeature<"execute-only", "Enable the generation of " "execute only code.">; -foreach i = {6-11} in - def FeatureReserveR#i : SubtargetFeature<"reserve-r"#i, - "ReservedGPRegisters["#i#"]", "true", - "Reserve R"#i#", making it " - "unavailable as a GPR">; +def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable" + " as GPR">; def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", "Don't use movt/movw pairs for " diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp index 10153dd2e3950..ed0969fa625b0 100644 --- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp @@ -752,7 +752,7 @@ void ARMAsmPrinter::emitAttributes() { if (STI.isRWPI()) ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9IsSB); - else if (STI.isGPRegisterReserved(9)) + else if (STI.isR9Reserved()) ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_R9_use, ARMBuildAttrs::R9Reserved); else diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp 
b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index ef10c9f738ef8..cecc16ffccba8 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -2723,25 +2723,6 @@ static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg, return false; } -/// getSwappedCondition - assume the flags are set by MI(a,b), return -/// the condition code if we modify the instructions such that flags are -/// set by MI(b,a). -inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { - switch (CC) { - default: return ARMCC::AL; - case ARMCC::EQ: return ARMCC::EQ; - case ARMCC::NE: return ARMCC::NE; - case ARMCC::HS: return ARMCC::LS; - case ARMCC::LO: return ARMCC::HI; - case ARMCC::HI: return ARMCC::LO; - case ARMCC::LS: return ARMCC::HS; - case ARMCC::GE: return ARMCC::LE; - case ARMCC::LT: return ARMCC::GT; - case ARMCC::GT: return ARMCC::LT; - case ARMCC::LE: return ARMCC::GE; - } -} - /// getCmpToAddCondition - assume the flags are set by CMP(a,b), return /// the condition code if we modify the instructions such that flags are /// set by ADD(a,b,X). diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index afcdb648cbc8f..4ace52b32e9ff 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -198,11 +198,9 @@ getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, getFramePointerReg(STI)); if (hasBasePointer(MF)) markSuperRegs(Reserved, BasePtr); - for (size_t R = 0; R < ARM::GPRRegClass.getNumRegs(); ++R) { - if (STI.isGPRegisterReserved(R)) { - markSuperRegs(Reserved, ARM::R0 + R); - } - } + // Some targets reserve R9. + if (STI.isR9Reserved()) + markSuperRegs(Reserved, ARM::R9); // Reserve D16-D31 if the subtarget doesn't support them. if (!STI.hasD32()) { static_assert(ARM::D31 == ARM::D16 + 15, "Register list not consecutive!"); @@ -282,7 +280,7 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, case ARM::GPRRegClassID: { bool HasFP = MF.getFrameInfo().isMaxCallFrameSizeComputed() ? TFI->hasFP(MF) : true; - return 10 - HasFP - STI.getNumGPRegistersReserved(); + return 10 - HasFP - (STI.isR9Reserved() ? 1 : 0); } case ARM::SPRRegClassID: // Currently not used as 'rep' register class. case ARM::DPRRegClassID: @@ -382,11 +380,6 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const ARMFunctionInfo *AFI = MF.getInfo(); const ARMFrameLowering *TFI = getFrameLowering(MF); - const ARMSubtarget &STI = MF.getSubtarget(); - - // Disable base pointer R6 if -ffixed-r6 is used. - if (STI.isGPRegisterReserved(BasePtr - ARM::R0)) - return false; // If we have stack realignment and VLAs, we have no pointer to use to // access the stack. If we have stack realignment, and a large call frame, @@ -423,7 +416,6 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { const MachineRegisterInfo *MRI = &MF.getRegInfo(); const ARMFrameLowering *TFI = getFrameLowering(MF); - const ARMSubtarget &STI = MF.getSubtarget(); // We can't realign the stack if: // 1. Dynamic stack realignment is explicitly disabled, // 2. There are VLAs in the function and the base pointer is disabled. @@ -433,9 +425,6 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // register allocation with frame pointer elimination, it is too late now. 
if (!MRI->canReserveReg(getFramePointerReg(MF.getSubtarget()))) return false; - // Disable base pointer R6 if -ffixed-r6 is used. - if (STI.isGPRegisterReserved(BasePtr - ARM::R0)) - return false; // We may also need a base pointer if there are dynamic allocas or stack // pointer adjustments around calls. if (TFI->hasReservedCallFrame(MF)) diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index 24ca25f73e96d..634fb89b8e893 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -1917,6 +1917,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() { MachineInstrBuilder MIB = BuildMI(*MBB, Br.MI, Br.MI->getDebugLoc(), TII->get(ARM::t2LE)); + // Swapped a t2Bcc for a t2LE, so no need to update the size of the block. MIB.add(Br.MI->getOperand(0)); Br.MI->eraseFromParent(); Br.MI = MIB; @@ -1975,21 +1976,20 @@ bool ARMConstantIslands::optimizeThumb2Branches() { .addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags()); Cmp.MI->eraseFromParent(); - BBInfoVector &BBInfo = BBUtils->getBBInfo(); - BBInfo[MBB->getNumber()].Size -= 2; if (Br.MI->getOpcode() == ARM::tBcc) { Br.MI->eraseFromParent(); Br.MI = NewBR; - } else if (&MBB->back() != Br.MI) { - // We've generated an LE and already erased the original conditional - // branch. The CBN?Z is now used to branch to the other successor, so an - // unconditional branch terminator is now redundant. + BBUtils->adjustBBSize(MBB, -2); + } else if (MBB->back().getOpcode() != ARM::t2LE) { + // An LE has been generated, but it's not the terminator - that is an + // unconditional branch. However, the logic has now been reversed with the + // CBN?Z being the conditional branch and the LE being the unconditional + // branch. So this means we can remove the redundant unconditional branch + // at the end of the block. MachineInstr *LastMI = &MBB->back(); - if (LastMI != Br.MI) { - BBInfo[MBB->getNumber()].Size -= LastMI->getDesc().getSize(); - LastMI->eraseFromParent(); - } + BBUtils->adjustBBSize(MBB, -LastMI->getDesc().getSize()); + LastMI->eraseFromParent(); } BBUtils->adjustBBOffsetsAfter(MBB); ++NumCBZ; diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 563fdda561049..de4377ec5a471 100644 --- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -1213,9 +1213,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MBBI = NewMI; return true; } + case ARM::VMOVHcc: case ARM::VMOVScc: case ARM::VMOVDcc: { - unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD; + unsigned newOpc = Opcode != ARM::VMOVDcc ? ARM::VMOVS : ARM::VMOVD; BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc), MI.getOperand(1).getReg()) .add(MI.getOperand(2)) diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 106894e28f033..5428bd6c94b35 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1704,19 +1704,6 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF); for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; - if (STI.isRWPI() && Reg == ARM::R9) { - // Paranoid check for use of R9 with RWPI. 
Clobbering R9 with -frwpi will - // emit warnings about undefined behaviour but maybe theres's a valid use - // case so on that basis allow it to be pushed/popped in the - // prologue/epilogue. - } else if (Reg > ARM::R0 && ARM::GPRRegClass.contains(Reg) && - STI.isGPRegisterReserved(Reg - ARM::R0)) { - LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " has been reserved and" - << " should not be allocatable" - << " or spillable.\n"); - SavedRegs.reset(Reg); - continue; - } bool Spilled = false; if (SavedRegs.test(Reg)) { Spilled = true; @@ -1961,7 +1948,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, LLVM_DEBUG(dbgs() << printReg(Reg, TRI) << " is saved low register, RegDeficit = " << RegDeficit << "\n"); - } else if (!STI.isGPRegisterReserved(Reg - ARM::R0)) { + } else { AvailableRegs.push_back(Reg); LLVM_DEBUG( dbgs() @@ -1976,7 +1963,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF, --RegDeficit; LLVM_DEBUG(dbgs() << "%r7 is saved low register, RegDeficit = " << RegDeficit << "\n"); - } else if (!STI.isGPRegisterReserved(7)) { + } else { AvailableRegs.push_back(ARM::R7); LLVM_DEBUG( dbgs() diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 46a2560e16745..a6b334938e179 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1351,11 +1351,27 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm7Offset(SDNode *Op, SDValue N, SDValue &OffImm, unsigned Shift) { unsigned Opcode = Op->getOpcode(); - ISD::MemIndexedMode AM = (Opcode == ISD::LOAD) - ? cast(Op)->getAddressingMode() - : cast(Op)->getAddressingMode(); + ISD::MemIndexedMode AM; + switch (Opcode) { + case ISD::LOAD: + AM = cast(Op)->getAddressingMode(); + break; + case ISD::STORE: + AM = cast(Op)->getAddressingMode(); + break; + case ISD::MLOAD: + AM = cast(Op)->getAddressingMode(); + break; + case ISD::MSTORE: + AM = cast(Op)->getAddressingMode(); + break; + default: + llvm_unreachable("Unexpected Opcode for Imm7Offset"); + } + int RHSC; - if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { // 7 bits. + // 7 bit constant, shifted by Shift. + if (isScaledConstantInRange(N, 1 << Shift, 0, 0x80, RHSC)) { OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC)) ? 
CurDAG->getTargetConstant(RHSC * (1 << Shift), SDLoc(N), MVT::i32) @@ -1625,58 +1641,93 @@ bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) { } bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) { - LoadSDNode *LD = cast(N); - ISD::MemIndexedMode AM = LD->getAddressingMode(); - if (AM == ISD::UNINDEXED) - return false; - EVT LoadedVT = LD->getMemoryVT(); - if (!LoadedVT.isVector()) - return false; - bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; - SDValue Offset; - bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + EVT LoadedVT; unsigned Opcode = 0; - unsigned Align = LD->getAlignment(); - bool IsLE = Subtarget->isLittle(); + bool isSExtLd, isPre; + unsigned Align; + ARMVCC::VPTCodes Pred; + SDValue PredReg; + SDValue Chain, Base, Offset; + + if (LoadSDNode *LD = dyn_cast(N)) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + + Chain = LD->getChain(); + Base = LD->getBasePtr(); + Offset = LD->getOffset(); + Align = LD->getAlignment(); + isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + Pred = ARMVCC::None; + PredReg = CurDAG->getRegister(0, MVT::i32); + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + ISD::MemIndexedMode AM = LD->getAddressingMode(); + if (AM == ISD::UNINDEXED) + return false; + LoadedVT = LD->getMemoryVT(); + if (!LoadedVT.isVector()) + return false; + Chain = LD->getChain(); + Base = LD->getBasePtr(); + Offset = LD->getOffset(); + Align = LD->getAlignment(); + isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD; + isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC); + Pred = ARMVCC::Then; + PredReg = LD->getMask(); + } else + llvm_unreachable("Expected a Load or a Masked Load!"); + + // We allow LE non-masked loads to change the type (for example use a vldrb.8 + // as opposed to a vldrw.32). This can allow extra addressing modes or + // alignments for what is otherwise an equivalent instruction. + bool CanChangeType = Subtarget->isLittle() && !isa(N); + + SDValue NewOffset; if (Align >= 2 && LoadedVT == MVT::v4i16 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post; else Opcode = isPre ? ARM::MVE_VLDRHU32_pre : ARM::MVE_VLDRHU32_post; } else if (LoadedVT == MVT::v8i8 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS16_pre : ARM::MVE_VLDRBS16_post; else Opcode = isPre ? ARM::MVE_VLDRBU16_pre : ARM::MVE_VLDRBU16_post; } else if (LoadedVT == MVT::v4i8 && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) { + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) { if (isSExtLd) Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post; else Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post; } else if (Align >= 4 && - (IsLE || LoadedVT == MVT::v4i32 || LoadedVT == MVT::v4f32) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 2)) + (CanChangeType || LoadedVT == MVT::v4i32 || + LoadedVT == MVT::v4f32) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2)) Opcode = isPre ? 
ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post; else if (Align >= 2 && - (IsLE || LoadedVT == MVT::v8i16 || LoadedVT == MVT::v8f16) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 1)) + (CanChangeType || LoadedVT == MVT::v8i16 || + LoadedVT == MVT::v8f16) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) Opcode = isPre ? ARM::MVE_VLDRHU16_pre : ARM::MVE_VLDRHU16_post; - else if ((IsLE || LoadedVT == MVT::v16i8) && - SelectT2AddrModeImm7Offset(N, LD->getOffset(), Offset, 0)) + else if ((CanChangeType || LoadedVT == MVT::v16i8) && + SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 0)) Opcode = isPre ? ARM::MVE_VLDRBU8_pre : ARM::MVE_VLDRBU8_post; else return false; - SDValue Chain = LD->getChain(); - SDValue Base = LD->getBasePtr(); - SDValue Ops[] = {Base, Offset, - CurDAG->getTargetConstant(ARMVCC::None, SDLoc(N), MVT::i32), - CurDAG->getRegister(0, MVT::i32), Chain}; - SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), LD->getValueType(0), + SDValue Ops[] = {Base, NewOffset, + CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg, + Chain}; + SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0), MVT::i32, MVT::Other, Ops); transferMemOperands(N, New); ReplaceUses(SDValue(N, 0), SDValue(New, 1)); @@ -3292,6 +3343,11 @@ void ARMDAGToDAGISel::Select(SDNode *N) { // Other cases are autogenerated. break; } + case ISD::MLOAD: + if (Subtarget->hasMVEIntegerOps() && tryMVEIndexedLoad(N)) + return; + // Other cases are autogenerated. + break; case ARMISD::WLS: case ARMISD::LE: { SDValue Ops[] = { N->getOperand(1), diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index e359756b7bf45..3dcddd73f309d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -209,6 +209,9 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, VT != MVT::v2i64 && VT != MVT::v1i64) for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); + if (!VT.isFloatingPoint()) + for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT}) + setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { @@ -296,6 +299,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); } } @@ -322,6 +327,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, Legal); + setIndexedMaskedStoreAction(im, VT, Legal); } if (HasMVEFP) { @@ -374,12 +381,12 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) { // Pre and Post inc on these are legal, given the correct extends for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { - setIndexedLoadAction(im, MVT::v8i8, Legal); - setIndexedStoreAction(im, MVT::v8i8, Legal); - setIndexedLoadAction(im, MVT::v4i8, Legal); - setIndexedStoreAction(im, MVT::v4i8, Legal); - setIndexedLoadAction(im, MVT::v4i16, Legal); - setIndexedStoreAction(im, MVT::v4i16, Legal); + for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) { + setIndexedLoadAction(im, VT, Legal); + setIndexedStoreAction(im, VT, Legal); + setIndexedMaskedLoadAction(im, VT, 
Legal); + setIndexedMaskedStoreAction(im, VT, Legal); + } } // Predicate types @@ -5572,15 +5579,9 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { Register ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { Register Reg = StringSwitch(RegName) - .Case("r6", ARM::R6) - .Case("r7", ARM::R7) - .Case("r8", ARM::R8) - .Case("r9", ARM::R9) - .Case("r10", ARM::R10) - .Case("r11", ARM::R11) - .Case("sp", ARM::SP) - .Default(ARM::NoRegister); - if (Reg != ARM::NoRegister) + .Case("sp", ARM::SP) + .Default(0); + if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); @@ -8992,6 +8993,12 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) { ST->getMemOperand()); } +static bool isZeroVector(SDValue N) { + return (ISD::isBuildVectorAllZeros(N.getNode()) || + (N->getOpcode() == ARMISD::VMOVIMM && + isNullConstant(N->getOperand(0)))); +} + static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { MaskedLoadSDNode *N = cast(Op.getNode()); MVT VT = Op.getSimpleValueType(); @@ -8999,13 +9006,7 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { SDValue PassThru = N->getPassThru(); SDLoc dl(Op); - auto IsZero = [](SDValue PassThru) { - return (ISD::isBuildVectorAllZeros(PassThru.getNode()) || - (PassThru->getOpcode() == ARMISD::VMOVIMM && - isNullConstant(PassThru->getOperand(0)))); - }; - - if (IsZero(PassThru)) + if (isZeroVector(PassThru)) return Op; // MVE Masked loads use zero as the passthru value. Here we convert undef to @@ -9013,12 +9014,13 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) { SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(0, dl, MVT::i32)); SDValue NewLoad = DAG.getMaskedLoad( - VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(), - N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad()); + VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec, + N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + N->getExtensionType(), N->isExpandingLoad()); SDValue Combo = NewLoad; if (!PassThru.isUndef() && (PassThru.getOpcode() != ISD::BITCAST || - !IsZero(PassThru->getOperand(0)))) + !isZeroVector(PassThru->getOperand(0)))) Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl); } @@ -12741,6 +12743,39 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return SDValue(); } +static SDValue PerformVCMPCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI, + const ARMSubtarget *Subtarget) { + if (!Subtarget->hasMVEIntegerOps()) + return SDValue(); + + EVT VT = N->getValueType(0); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + ARMCC::CondCodes Cond = + (ARMCC::CondCodes)cast(N->getOperand(2))->getZExtValue(); + SDLoc dl(N); + + // vcmp X, 0, cc -> vcmpz X, cc + if (isZeroVector(Op1)) + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, + N->getOperand(2)); + + unsigned SwappedCond = getSwappedCondition(Cond); + if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) { + // vcmp 0, X, cc -> vcmpz X, reversed(cc) + if (isZeroVector(Op0)) + return DCI.DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); + // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc) + if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP) + return DCI.DAG.getNode(ARMISD::VCMP, dl, VT, 
Op1, Op0, + DCI.DAG.getConstant(SwappedCond, dl, MVT::i32)); + } + + return SDValue(); +} + /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, @@ -14421,6 +14456,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, return PerformARMBUILD_VECTORCombine(N, DCI); case ARMISD::PREDICATE_CAST: return PerformPREDICATE_CASTCombine(N, DCI); + case ARMISD::VCMP: + return PerformVCMPCombine(N, DCI, Subtarget); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); @@ -15192,14 +15229,19 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, } static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, - bool isSEXTLoad, bool isLE, SDValue &Base, - SDValue &Offset, bool &isInc, - SelectionDAG &DAG) { + bool isSEXTLoad, bool IsMasked, bool isLE, + SDValue &Base, SDValue &Offset, + bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; if (!isa(Ptr->getOperand(1))) return false; + // We allow LE non-masked loads to change the type (for example use a vldrb.8 + // as opposed to a vldrw.32). This can allow extra addressing modes or + // alignments for what is otherwise an equivalent instruction. + bool CanChangeType = isLE && !IsMasked; + ConstantSDNode *RHS = cast(Ptr->getOperand(1)); int RHSC = (int)RHS->getZExtValue(); @@ -15218,7 +15260,7 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, }; // Try to find a matching instruction based on s/zext, Alignment, Offset and - // (in BE) type. + // (in BE/masked) type. Base = Ptr->getOperand(0); if (VT == MVT::v4i16) { if (Align >= 2 && IsInRange(RHSC, 0x80, 2)) @@ -15226,13 +15268,15 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align, } else if (VT == MVT::v4i8 || VT == MVT::v8i8) { if (IsInRange(RHSC, 0x80, 1)) return true; - } else if (Align >= 4 && (isLE || VT == MVT::v4i32 || VT == MVT::v4f32) && + } else if (Align >= 4 && + (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) && IsInRange(RHSC, 0x80, 4)) return true; - else if (Align >= 2 && (isLE || VT == MVT::v8i16 || VT == MVT::v8f16) && + else if (Align >= 2 && + (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) && IsInRange(RHSC, 0x80, 2)) return true; - else if ((isLE || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) + else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1)) return true; return false; } @@ -15252,6 +15296,7 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue Ptr; unsigned Align; bool isSEXTLoad = false; + bool IsMasked = false; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); @@ -15261,6 +15306,17 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); Align = ST->getAlignment(); + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + Ptr = LD->getBasePtr(); + VT = LD->getMemoryVT(); + Align = LD->getAlignment(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast(N)) { + Ptr = ST->getBasePtr(); + VT = ST->getMemoryVT(); + Align = ST->getAlignment(); + IsMasked = true; } else return false; @@ -15269,8 +15325,8 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && 
getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad, - Subtarget->isLittle(), Base, Offset, - isInc, DAG); + IsMasked, Subtarget->isLittle(), Base, + Offset, isInc, DAG); else { if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, @@ -15298,6 +15354,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue Ptr; unsigned Align; bool isSEXTLoad = false, isNonExt; + bool IsMasked = false; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); @@ -15309,6 +15366,19 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, Ptr = ST->getBasePtr(); Align = ST->getAlignment(); isNonExt = !ST->isTruncatingStore(); + } else if (MaskedLoadSDNode *LD = dyn_cast(N)) { + VT = LD->getMemoryVT(); + Ptr = LD->getBasePtr(); + Align = LD->getAlignment(); + isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; + isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; + IsMasked = true; + } else if (MaskedStoreSDNode *ST = dyn_cast(N)) { + VT = ST->getMemoryVT(); + Ptr = ST->getBasePtr(); + Align = ST->getAlignment(); + isNonExt = !ST->isTruncatingStore(); + IsMasked = true; } else return false; @@ -15332,7 +15402,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, bool isLegal = false; if (VT.isVector()) isLegal = Subtarget->hasMVEIntegerOps() && - getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, + getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked, Subtarget->isLittle(), Base, Offset, isInc, DAG); else { diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td index fe696222ec70a..155e0efff1a8a 100644 --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -297,6 +297,28 @@ class RegConstraint { string Constraints = C; } +// ARMCC condition codes. See ARMCC::CondCodes +def ARMCCeq : PatLeaf<(i32 0)>; +def ARMCCne : PatLeaf<(i32 1)>; +def ARMCChs : PatLeaf<(i32 2)>; +def ARMCClo : PatLeaf<(i32 3)>; +def ARMCCmi : PatLeaf<(i32 4)>; +def ARMCCpl : PatLeaf<(i32 5)>; +def ARMCCvs : PatLeaf<(i32 6)>; +def ARMCCvc : PatLeaf<(i32 7)>; +def ARMCChi : PatLeaf<(i32 8)>; +def ARMCCls : PatLeaf<(i32 9)>; +def ARMCCge : PatLeaf<(i32 10)>; +def ARMCClt : PatLeaf<(i32 11)>; +def ARMCCgt : PatLeaf<(i32 12)>; +def ARMCCle : PatLeaf<(i32 13)>; +def ARMCCal : PatLeaf<(i32 14)>; + +// VCC predicates. See ARMVCC::VPTCodes +def ARMVCCNone : PatLeaf<(i32 0)>; +def ARMVCCThen : PatLeaf<(i32 1)>; +def ARMVCCElse : PatLeaf<(i32 2)>; + //===----------------------------------------------------------------------===// // ARM specific transformation functions and pattern fragments. 
// diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 429d0a1cf1bdf..c81e60b3360a2 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1015,6 +1015,16 @@ let Predicates = [HasMVEFloat] in { (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fmaxnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMAXNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; + def : Pat<(v4f32 (int_arm_mve_max_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (MVE_VMAXNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + def : Pat<(v8f16 (int_arm_mve_max_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (MVE_VMAXNMf32 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; } def MVE_VMINNMf32 : MVE_VMINMAXNM<"vminnm", "f32", 0b0, 0b1>; @@ -1025,6 +1035,16 @@ let Predicates = [HasMVEFloat] in { (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; def : Pat<(v8f16 (fminnum (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), (v8f16 (MVE_VMINNMf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; + def : Pat<(v4f32 (int_arm_mve_min_predicated (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), + (v4f32 (MVE_VMINNMf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2), + ARMVCCThen, (v4i1 VCCR:$mask), + (v4f32 MQPR:$inactive)))>; + def : Pat<(v8f16 (int_arm_mve_min_predicated (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), + (v8f16 (MVE_VMINNMf32 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2), + ARMVCCThen, (v8i1 VCCR:$mask), + (v8f16 MQPR:$inactive)))>; } @@ -1042,48 +1062,45 @@ class MVE_VMINMAX size, let Inst{4} = bit_4; } -multiclass MVE_VMINMAX_all_sizes { - def s8 : MVE_VMINMAX; - def s16 : MVE_VMINMAX; - def s32 : MVE_VMINMAX; - def u8 : MVE_VMINMAX; - def u16 : MVE_VMINMAX; - def u32 : MVE_VMINMAX; -} +multiclass MVE_VMINMAX_m { + def "" : MVE_VMINMAX; -defm MVE_VMAX : MVE_VMINMAX_all_sizes<"vmax", 0b0>; -defm MVE_VMIN : MVE_VMINMAX_all_sizes<"vmin", 0b1>; + let Predicates = [HasMVEInt] in { + // Unpredicated min/max + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (smin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMINs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (smin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMINs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (smin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMINs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - - def : Pat<(v16i8 (smax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMAXs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (smax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMAXs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (smax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMAXs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - - def : Pat<(v16i8 (umin (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMINu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (umin (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMINu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : 
Pat<(v4i32 (umin (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMINu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - - def : Pat<(v16i8 (umax (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMAXu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (umax (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMAXu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (umax (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMAXu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + // Predicated min/max + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +multiclass MVE_VMAX + : MVE_VMINMAX_m<"vmax", 0b0, VTI, !if(VTI.Unsigned, umax, smax), int_arm_mve_max_predicated>; +multiclass MVE_VMIN + : MVE_VMINMAX_m<"vmin", 0b1, VTI, !if(VTI.Unsigned, umin, smin), int_arm_mve_min_predicated>; + +defm MVE_VMINs8 : MVE_VMIN; +defm MVE_VMINs16 : MVE_VMIN; +defm MVE_VMINs32 : MVE_VMIN; +defm MVE_VMINu8 : MVE_VMIN; +defm MVE_VMINu16 : MVE_VMIN; +defm MVE_VMINu32 : MVE_VMIN; + +defm MVE_VMAXs8 : MVE_VMAX; +defm MVE_VMAXs16 : MVE_VMAX; +defm MVE_VMAXs32 : MVE_VMAX; +defm MVE_VMAXu8 : MVE_VMAX; +defm MVE_VMAXu16 : MVE_VMAX; +defm MVE_VMAXu32 : MVE_VMAX; + // end of mve_comp instructions // start of mve_bit instructions @@ -1233,53 +1250,61 @@ foreach s=["s8", "s16", "s32", "u8", "u16", "u32", "i8", "i16", "i32", "f16", "f (MVE_VAND MQPR:$QdSrc, MQPR:$QnSrc, MQPR:$QmSrc, vpred_r:$vp)>; } -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VAND (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VAND (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VAND (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), - (v2i64 (MVE_VAND (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VORR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VORR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VORR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), - (v2i64 (MVE_VORR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (xor (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VEOR (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (xor (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VEOR (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (xor (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VEOR (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (xor (v2i64 MQPR:$val1), (v2i64 MQPR:$val2))), - (v2i64 (MVE_VEOR (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (and (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), - (v16i8 (MVE_VBIC (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (and (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), - (v8i16 (MVE_VBIC (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (and (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), - (v4i32 (MVE_VBIC 
(v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (and (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), - (v2i64 (MVE_VBIC (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; - - def : Pat<(v16i8 (or (v16i8 MQPR:$val1), (vnotq MQPR:$val2))), - (v16i8 (MVE_VORN (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (or (v8i16 MQPR:$val1), (vnotq MQPR:$val2))), - (v8i16 (MVE_VORN (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (or (v4i32 MQPR:$val1), (vnotq MQPR:$val2))), - (v4i32 (MVE_VORN (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; - def : Pat<(v2i64 (or (v2i64 MQPR:$val1), (vnotq MQPR:$val2))), - (v2i64 (MVE_VORN (v2i64 MQPR:$val1), (v2i64 MQPR:$val2)))>; +multiclass MVE_bit_op { + let Predicates = [HasMVEInt] in { + // Unpredicated operation + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + // Predicated operation + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (instruction + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +defm : MVE_bit_op; +defm : MVE_bit_op; +defm : MVE_bit_op; +defm : MVE_bit_op; + +defm : MVE_bit_op; +defm : MVE_bit_op; +defm : MVE_bit_op; +defm : MVE_bit_op; + +defm : MVE_bit_op; +defm : MVE_bit_op; +defm : MVE_bit_op; +defm : MVE_bit_op; + +multiclass MVE_bit_op_with_inv { + let Predicates = [HasMVEInt] in { + // Unpredicated operation + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (vnotq (VTI.Vec MQPR:$Qn)))), + (VTI.Vec (instruction (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + // Predicated operation + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (instruction + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +defm : MVE_bit_op_with_inv; +defm : MVE_bit_op_with_inv; +defm : MVE_bit_op_with_inv; +defm : MVE_bit_op_with_inv; + +defm : MVE_bit_op_with_inv; +defm : MVE_bit_op_with_inv; +defm : MVE_bit_op_with_inv; +defm : MVE_bit_op_with_inv; + class MVE_bit_cmode cmode, dag inOps> : MVE_p<(outs MQPR:$Qd), inOps, NoItinerary, iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> { @@ -1512,8 +1537,9 @@ class MVE_int size, list pattern=[]> let Inst{3-1} = Qm{2-0}; } -class MVE_VMULt1 size, list pattern=[]> - : MVE_int<"vmul", suffix, size, pattern> { +class MVE_VMULt1 size, + list pattern=[]> + : MVE_int { let Inst{28} = 0b0; let Inst{25-23} = 0b110; @@ -1524,19 +1550,33 @@ class MVE_VMULt1 size, list pattern=[]> let validForTailPredication = 1; } -def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>; -def MVE_VMULt1i16 : MVE_VMULt1<"i16", 0b01>; -def MVE_VMULt1i32 : MVE_VMULt1<"i32", 0b10>; +multiclass MVE_VMUL_m { + def "" : MVE_VMULt1; -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))), - (v16i8 (MVE_VMULt1i8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>; - def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))), - (v8i16 (MVE_VMULt1i16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>; - def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))), - (v4i32 (MVE_VMULt1i32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)))>; + let Predicates = [HasMVEInt] in { + // Unpredicated multiply + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec 
MQPR:$Qn)))>; + + // Predicated multiply + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +multiclass MVE_VMUL + : MVE_VMUL_m<"vmul", VTI, mul, int_arm_mve_mul_predicated>; + +defm MVE_VMULi8 : MVE_VMUL; +defm MVE_VMULi16 : MVE_VMUL; +defm MVE_VMULi32 : MVE_VMUL; + class MVE_VQxDMULH size, bit rounding, list pattern=[]> : MVE_int { @@ -1590,7 +1630,7 @@ multiclass MVE_VADDSUB_m(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 1), (VTI.Pred VCCR:$mask), + ARMVCCThen, (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))>; } } @@ -1664,7 +1704,8 @@ let Predicates = [HasMVEInt] in { } -class MVE_VABD_int size, list pattern=[]> +class MVE_VABD_int size, + list pattern=[]> : MVE_int<"vabd", suffix, size, pattern> { let Inst{28} = U; @@ -1676,12 +1717,35 @@ class MVE_VABD_int size, list pattern=[]> let validForTailPredication = 1; } -def MVE_VABDs8 : MVE_VABD_int<"s8", 0b0, 0b00>; -def MVE_VABDs16 : MVE_VABD_int<"s16", 0b0, 0b01>; -def MVE_VABDs32 : MVE_VABD_int<"s32", 0b0, 0b10>; -def MVE_VABDu8 : MVE_VABD_int<"u8", 0b1, 0b00>; -def MVE_VABDu16 : MVE_VABD_int<"u16", 0b1, 0b01>; -def MVE_VABDu32 : MVE_VABD_int<"u32", 0b1, 0b10>; +multiclass MVE_VABD_m { + def "" : MVE_VABD_int; + + let Predicates = [HasMVEInt] in { + // Unpredicated absolute difference + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated absolute difference + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VABD + : MVE_VABD_m; + +defm MVE_VABDs8 : MVE_VABD; +defm MVE_VABDs16 : MVE_VABD; +defm MVE_VABDs32 : MVE_VABD; +defm MVE_VABDu8 : MVE_VABD; +defm MVE_VABDu16 : MVE_VABD; +defm MVE_VABDu32 : MVE_VABD; class MVE_VRHADD size, list pattern=[]> : MVE_int<"vrhadd", suffix, size, pattern> { @@ -1738,60 +1802,6 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>; def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>; def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>; -let Predicates = [HasMVEInt] in { - def : Pat<(v16i8 (ARMvshrsImm - (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHADDs8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshrsImm - (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHADDs16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshrsImm - (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHADDs32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; - - def : Pat<(v16i8 (ARMvshruImm - (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHADDu8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshruImm - (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHADDu16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshruImm - (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHADDu32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; - - def : Pat<(v16i8 (ARMvshrsImm - (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHSUBs8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshrsImm - (v8i16 (sub 
(v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHSUBs16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshrsImm - (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHSUBs32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; - - def : Pat<(v16i8 (ARMvshruImm - (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)), - (v16i8 (MVE_VHSUBu8 - (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>; - def : Pat<(v8i16 (ARMvshruImm - (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)), - (v8i16 (MVE_VHSUBu16 - (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>; - def : Pat<(v4i32 (ARMvshruImm - (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)), - (v4i32 (MVE_VHSUBu32 - (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>; -} - class MVE_VDUP pattern=[]> : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary, "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> { @@ -1969,17 +1979,17 @@ multiclass vqabsneg_pattern 0 ? r : (r == INT_MIN ? INT_MAX : -r) def : Pat<(VTI.Vec (vselect - (VTI.Pred (ARMvcmpz (VTI.Vec MQPR:$reg), (i32 12))), + (VTI.Pred (ARMvcmpz (VTI.Vec MQPR:$reg), ARMCCgt)), (VTI.Vec MQPR:$reg), (VTI.Vec (vselect - (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, (i32 0))), + (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)), int_max, (sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))))), (VTI.Vec (vqabs_instruction (VTI.Vec MQPR:$reg)))>; // Similarly, this tree represents vqneg, i.e. the following vectorized expression: // r == INT_MIN ? INT_MAX : -r def : Pat<(VTI.Vec (vselect - (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, (i32 0))), + (VTI.Pred (ARMvcmp (VTI.Vec MQPR:$reg), int_min, ARMCCeq)), int_max, (sub (VTI.Vec zero_vec), (VTI.Vec MQPR:$reg)))), (VTI.Vec (vqneg_instruction (VTI.Vec MQPR:$reg)))>; @@ -2781,8 +2791,8 @@ class MVEFloatArithNeon pattern=[]> - : MVEFloatArithNeon<"vmul", suffix, size, (outs MQPR:$Qd), +class MVE_VMUL_fp pattern=[]> + : MVEFloatArithNeon { bits<4> Qd; @@ -2800,16 +2810,29 @@ class MVE_VMUL_fp pattern=[]> let validForTailPredication = 1; } -def MVE_VMULf32 : MVE_VMUL_fp<"f32", 0b0>; -def MVE_VMULf16 : MVE_VMUL_fp<"f16", 0b1>; +multiclass MVE_VMULT_fp_m { + def "" : MVE_VMUL_fp; -let Predicates = [HasMVEFloat] in { - def : Pat<(v4f32 (fmul (v4f32 MQPR:$val1), (v4f32 MQPR:$val2))), - (v4f32 (MVE_VMULf32 (v4f32 MQPR:$val1), (v4f32 MQPR:$val2)))>; - def : Pat<(v8f16 (fmul (v8f16 MQPR:$val1), (v8f16 MQPR:$val2))), - (v8f16 (MVE_VMULf16 (v8f16 MQPR:$val1), (v8f16 MQPR:$val2)))>; + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } } +multiclass MVE_VMUL_fp_m + : MVE_VMULT_fp_m<"vmul", 0, VTI, fmul, int_arm_mve_mul_predicated>; + +defm MVE_VMULf32 : MVE_VMUL_fp_m; +defm MVE_VMULf16 : MVE_VMUL_fp_m; + class MVE_VCMLA pattern=[]> : MVEFloatArithNeon<"vcmla", suffix, size, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qn, MQPR:$Qm, complexrotateop:$rot), @@ -2890,7 +2913,7 @@ multiclass MVE_VADDSUB_fp_m(NAME) (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), - (i32 1), (VTI.Pred VCCR:$mask), + ARMVCCThen, (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive)))>; } } @@ -2950,8 +2973,28 @@ class MVE_VABD_fp let validForTailPredication = 1; } -def MVE_VABDf32 : MVE_VABD_fp<"f32", 0b0>; -def 
MVE_VABDf16 : MVE_VABD_fp<"f16", 0b1>; +multiclass MVE_VABDT_fp_m { + def "" : MVE_VABD_fp; + + let Predicates = [HasMVEFloat] in { + def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VABD_fp_m + : MVE_VABDT_fp_m; + +defm MVE_VABDf32 : MVE_VABD_fp_m; +defm MVE_VABDf16 : MVE_VABD_fp_m; class MVE_VCVT_fix pattern=[]> @@ -3303,155 +3346,120 @@ def MVE_VCMPs8r : MVE_VCMPqrs<"s8", 0b00>; def MVE_VCMPs16r : MVE_VCMPqrs<"s16", 0b01>; def MVE_VCMPs32r : MVE_VCMPqrs<"s32", 0b10>; -multiclass unpred_vcmp_z { - def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))), +multiclass unpred_vcmp_z { + def i8 : Pat<(v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)), (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc))>; - def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))), + def i16 : Pat<(v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)), (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc))>; - def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))), + def i32 : Pat<(v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)), (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), (i32 fc))))), - (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), (i32 fc))))), - (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), (i32 fc))))), - (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmpz (v16i8 MQPR:$v1), fc)))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8i16 MQPR:$v1), fc)))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4i32 MQPR:$v1), fc)))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; } -multiclass unpred_vcmp_r { - def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))), +multiclass unpred_vcmp_r { + def i8 : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)), (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc))>; - def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))), + def i16 : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)), (v8i1 (!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc))>; - def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))), + def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)), (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; - def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))), + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)), (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), 
(v8i16 (ARMvdup GPR:$v2)), (i32 fc))), + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)), (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))), + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)), (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), (i32 fc))))), - (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), (i32 fc))))), - (v8i1 (!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), (i32 fc))))), - (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, 1, VCCR:$p1))>; - - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), (i32 fc))))), - (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), (i32 fc))))), - (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), (i32 fc))))), - (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, 1, VCCR:$p1))>; -} - -multiclass unpred_vcmp_r_reversible { - defm "": unpred_vcmp_r; - - // Additional patterns that match the vector/scalar comparisons the - // opposite way round, with the ARMvdup in the first operand of the - // ARMvcmp. These will usually need a different condition code - // (except for the symmetric conditions EQ and NE). They're in a - // separate multiclass because the unsigned CS and HI comparisons - // don't have reversed forms. 
- - def : Pat<(v16i1 (ARMvcmp (v16i8 (ARMvdup GPR:$v1)), (v16i8 MQPR:$v2), (i32 fc))), - (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v2), (i32 GPR:$v1), fcReversed))>; - def : Pat<(v8i1 (ARMvcmp (v8i16 (ARMvdup GPR:$v1)), (v8i16 MQPR:$v2), (i32 fc))), - (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v2), (i32 GPR:$v1), fcReversed))>; - def : Pat<(v4i1 (ARMvcmp (v4i32 (ARMvdup GPR:$v1)), (v4i32 MQPR:$v2), (i32 fc))), - (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v2), (i32 GPR:$v1), fcReversed))>; - - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 (ARMvdup GPR:$v1)), (v16i8 MQPR:$v2), (i32 fc))))), - (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v2), (i32 GPR:$v1), fcReversed, 1, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 (ARMvdup GPR:$v1)), (v8i16 MQPR:$v2), (i32 fc))))), - (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v2), (i32 GPR:$v1), fcReversed, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 (ARMvdup GPR:$v1)), (v4i32 MQPR:$v2), (i32 fc))))), - (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v2), (i32 GPR:$v1), fcReversed, 1, VCCR:$p1))>; -} - -multiclass unpred_vcmpf_z { - def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))), + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc)))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16") (v8i16 MQPR:$v1), (v8i16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; +} + +multiclass unpred_vcmpf_z { + def f16 : Pat<(v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc))>; - def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))), + def f32 : Pat<(v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), (i32 fc))))), - (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), (i32 fc))))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))), + (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>; } -multiclass unpred_vcmpf_r { - def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))), +multiclass 
unpred_vcmpf_r { + def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; - def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))), + def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; - def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))), + def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)), (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; - def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))), + def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)), (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; - def : Pat<(v8i1 (ARMvcmp (v8f16 (ARMvdup HPR:$v1)), (v8f16 MQPR:$v2), (i32 fc))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v2), (i32 (COPY_TO_REGCLASS (f16 HPR:$v1), rGPR)), fcReversed))>; - def : Pat<(v4i1 (ARMvcmp (v4f32 (ARMvdup SPR:$v1)), (v4f32 MQPR:$v2), (i32 fc))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v2), (i32 (COPY_TO_REGCLASS (f32 SPR:$v1), rGPR)), fcReversed))>; - - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), (i32 fc))))), - (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), (i32 fc))))), - (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), (i32 fc))))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), (i32 fc))))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, 1, VCCR:$p1))>; - - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 (ARMvdup HPR:$v1)), (v8f16 MQPR:$v2), (i32 fc))))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v2), (i32 (COPY_TO_REGCLASS (f16 HPR:$v1), rGPR)), fcReversed, 1, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 (ARMvdup SPR:$v1)), (v4f32 MQPR:$v2), (i32 fc))))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v2), (i32 (COPY_TO_REGCLASS (f32 SPR:$v1), rGPR)), fcReversed, 1, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; } let Predicates = [HasMVEInt] in { - defm MVE_VCEQZ : unpred_vcmp_z<"i", 0>; - defm MVE_VCNEZ : unpred_vcmp_z<"i", 1>; - defm MVE_VCGEZ : unpred_vcmp_z<"s", 10>; - defm MVE_VCLTZ : unpred_vcmp_z<"s", 11>; - defm 
MVE_VCGTZ : unpred_vcmp_z<"s", 12>; - defm MVE_VCLEZ : unpred_vcmp_z<"s", 13>; - defm MVE_VCGTUZ : unpred_vcmp_z<"u", 8>; - defm MVE_VCGEUZ : unpred_vcmp_z<"u", 2>; - - defm MVE_VCEQ : unpred_vcmp_r_reversible<"i", 0, 0>; - defm MVE_VCNE : unpred_vcmp_r_reversible<"i", 1, 1>; - defm MVE_VCGE : unpred_vcmp_r_reversible<"s", 10, 13>; - defm MVE_VCLT : unpred_vcmp_r_reversible<"s", 11, 12>; - defm MVE_VCGT : unpred_vcmp_r_reversible<"s", 12, 11>; - defm MVE_VCLE : unpred_vcmp_r_reversible<"s", 13, 10>; - defm MVE_VCGTU : unpred_vcmp_r<"u", 8>; - defm MVE_VCGEU : unpred_vcmp_r<"u", 2>; + defm MVE_VCEQZ : unpred_vcmp_z<"i", ARMCCeq>; + defm MVE_VCNEZ : unpred_vcmp_z<"i", ARMCCne>; + defm MVE_VCGEZ : unpred_vcmp_z<"s", ARMCCge>; + defm MVE_VCLTZ : unpred_vcmp_z<"s", ARMCClt>; + defm MVE_VCGTZ : unpred_vcmp_z<"s", ARMCCgt>; + defm MVE_VCLEZ : unpred_vcmp_z<"s", ARMCCle>; + defm MVE_VCGTUZ : unpred_vcmp_z<"u", ARMCChi>; + defm MVE_VCGEUZ : unpred_vcmp_z<"u", ARMCChs>; + + defm MVE_VCEQ : unpred_vcmp_r<"i", ARMCCeq>; + defm MVE_VCNE : unpred_vcmp_r<"i", ARMCCne>; + defm MVE_VCGE : unpred_vcmp_r<"s", ARMCCge>; + defm MVE_VCLT : unpred_vcmp_r<"s", ARMCClt>; + defm MVE_VCGT : unpred_vcmp_r<"s", ARMCCgt>; + defm MVE_VCLE : unpred_vcmp_r<"s", ARMCCle>; + defm MVE_VCGTU : unpred_vcmp_r<"u", ARMCChi>; + defm MVE_VCGEU : unpred_vcmp_r<"u", ARMCChs>; } let Predicates = [HasMVEFloat] in { - defm MVE_VFCEQZ : unpred_vcmpf_z<0>; - defm MVE_VFCNEZ : unpred_vcmpf_z<1>; - defm MVE_VFCGEZ : unpred_vcmpf_z<10>; - defm MVE_VFCLTZ : unpred_vcmpf_z<11>; - defm MVE_VFCGTZ : unpred_vcmpf_z<12>; - defm MVE_VFCLEZ : unpred_vcmpf_z<13>; + defm MVE_VFCEQZ : unpred_vcmpf_z; + defm MVE_VFCNEZ : unpred_vcmpf_z; + defm MVE_VFCGEZ : unpred_vcmpf_z; + defm MVE_VFCLTZ : unpred_vcmpf_z; + defm MVE_VFCGTZ : unpred_vcmpf_z; + defm MVE_VFCLEZ : unpred_vcmpf_z; - defm MVE_VFCEQ : unpred_vcmpf_r<0, 0>; - defm MVE_VFCNE : unpred_vcmpf_r<1, 1>; - defm MVE_VFCGE : unpred_vcmpf_r<10, 13>; - defm MVE_VFCLT : unpred_vcmpf_r<11, 12>; - defm MVE_VFCGT : unpred_vcmpf_r<12, 11>; - defm MVE_VFCLE : unpred_vcmpf_r<13, 10>; + defm MVE_VFCEQ : unpred_vcmpf_r; + defm MVE_VFCNE : unpred_vcmpf_r; + defm MVE_VFCGE : unpred_vcmpf_r; + defm MVE_VFCLT : unpred_vcmpf_r; + defm MVE_VFCGT : unpred_vcmpf_r; + defm MVE_VFCLE : unpred_vcmpf_r; } @@ -3615,8 +3623,8 @@ defm MVE_VMULLu32 : MVE_VMULL_multi<"vmull", "u32", 0b1, 0b10, "@earlyclobber $Q defm MVE_VMULLp8 : MVE_VMULL_multi<"vmull", "p8", 0b0, 0b11>; defm MVE_VMULLp16 : MVE_VMULL_multi<"vmull", "p16", 0b1, 0b11>; -class MVE_VxMULH size, - bit round, list pattern=[]> +class MVE_VxMULH size, bit round, + list pattern=[]> : MVE_qDest_qSrc { @@ -3632,19 +3640,45 @@ class MVE_VxMULH size, let Inst{0} = 0b1; } -def MVE_VMULHs8 : MVE_VxMULH<"vmulh", "s8", 0b0, 0b00, 0b0>; -def MVE_VMULHs16 : MVE_VxMULH<"vmulh", "s16", 0b0, 0b01, 0b0>; -def MVE_VMULHs32 : MVE_VxMULH<"vmulh", "s32", 0b0, 0b10, 0b0>; -def MVE_VMULHu8 : MVE_VxMULH<"vmulh", "u8", 0b1, 0b00, 0b0>; -def MVE_VMULHu16 : MVE_VxMULH<"vmulh", "u16", 0b1, 0b01, 0b0>; -def MVE_VMULHu32 : MVE_VxMULH<"vmulh", "u32", 0b1, 0b10, 0b0>; +multiclass MVE_VxMULH_m { + def "" : MVE_VxMULH; -def MVE_VRMULHs8 : MVE_VxMULH<"vrmulh", "s8", 0b0, 0b00, 0b1>; -def MVE_VRMULHs16 : MVE_VxMULH<"vrmulh", "s16", 0b0, 0b01, 0b1>; -def MVE_VRMULHs32 : MVE_VxMULH<"vrmulh", "s32", 0b0, 0b10, 0b1>; -def MVE_VRMULHu8 : MVE_VxMULH<"vrmulh", "u8", 0b1, 0b00, 0b1>; -def MVE_VRMULHu16 : MVE_VxMULH<"vrmulh", "u16", 0b1, 0b01, 0b1>; -def MVE_VRMULHu32 : MVE_VxMULH<"vrmulh", "u32", 0b1, 0b10, 
0b1>; + let Predicates = [HasMVEInt] in { + // Unpredicated multiply returning high bits + def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>; + + // Predicated multiply returning high bits + def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + (VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))), + (VTI.Vec (!cast(NAME) + (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), + ARMVCCThen, (VTI.Pred VCCR:$mask), + (VTI.Vec MQPR:$inactive)))>; + } +} + +multiclass MVE_VMULT + : MVE_VxMULH_m; + +defm MVE_VMULHs8 : MVE_VMULT<"vmulh", MVE_v16s8, 0b0>; +defm MVE_VMULHs16 : MVE_VMULT<"vmulh", MVE_v8s16, 0b0>; +defm MVE_VMULHs32 : MVE_VMULT<"vmulh", MVE_v4s32, 0b0>; +defm MVE_VMULHu8 : MVE_VMULT<"vmulh", MVE_v16u8, 0b0>; +defm MVE_VMULHu16 : MVE_VMULT<"vmulh", MVE_v8u16, 0b0>; +defm MVE_VMULHu32 : MVE_VMULT<"vmulh", MVE_v4u32, 0b0>; + +defm MVE_VRMULHs8 : MVE_VMULT<"vrmulh", MVE_v16s8, 0b1>; +defm MVE_VRMULHs16 : MVE_VMULT<"vrmulh", MVE_v8s16, 0b1>; +defm MVE_VRMULHs32 : MVE_VMULT<"vrmulh", MVE_v4s32, 0b1>; +defm MVE_VRMULHu8 : MVE_VMULT<"vrmulh", MVE_v16u8, 0b1>; +defm MVE_VRMULHu16 : MVE_VMULT<"vrmulh", MVE_v8u16, 0b1>; +defm MVE_VRMULHu32 : MVE_VMULT<"vrmulh", MVE_v4u32, 0b1>; class MVE_VxMOVxN size, bit T, list pattern=[]> @@ -3716,7 +3750,7 @@ multiclass MVE_VCVT_f2h_m { (v4i1 VCCR:$mask))), (v8f16 (!cast(NAME) (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), - (i32 1), (v4i1 VCCR:$mask)))>; + ARMVCCThen, (v4i1 VCCR:$mask)))>; } } @@ -4224,7 +4258,7 @@ def MVE_VDWDUPu16 : MVE_VxWDUP<"vdwdup", "u16", 0b01, 0b1>; def MVE_VDWDUPu32 : MVE_VxWDUP<"vdwdup", "u32", 0b10, 0b1>; let hasSideEffects = 1 in -class MVE_VCTP size, list pattern=[]> +class MVE_VCTPInst size, list pattern=[]> : MVE_p<(outs VCCR:$P0), (ins rGPR:$Rn), NoItinerary, "vctp", suffix, "$Rn", vpred_n, "", pattern> { bits<4> Rn; @@ -4242,20 +4276,22 @@ class MVE_VCTP size, list pattern=[]> let validForTailPredication = 1; } -def MVE_VCTP8 : MVE_VCTP<"8", 0b00>; -def MVE_VCTP16 : MVE_VCTP<"16", 0b01>; -def MVE_VCTP32 : MVE_VCTP<"32", 0b10>; -def MVE_VCTP64 : MVE_VCTP<"64", 0b11>; +multiclass MVE_VCTP { + def "": MVE_VCTPInst; -let Predicates = [HasMVEInt] in { - def : Pat<(int_arm_vctp8 rGPR:$Rn), - (v16i1 (MVE_VCTP8 rGPR:$Rn))>; - def : Pat<(int_arm_vctp16 rGPR:$Rn), - (v8i1 (MVE_VCTP16 rGPR:$Rn))>; - def : Pat<(int_arm_vctp32 rGPR:$Rn), - (v4i1 (MVE_VCTP32 rGPR:$Rn))>; + let Predicates = [HasMVEInt] in { + def : Pat<(intr rGPR:$Rn), + (VTI.Pred (!cast(NAME) rGPR:$Rn))>; + def : Pat<(and (intr rGPR:$Rn), (VTI.Pred VCCR:$mask)), + (VTI.Pred (!cast(NAME) rGPR:$Rn, ARMVCCThen, VCCR:$mask))>; + } } +defm MVE_VCTP8 : MVE_VCTP; +defm MVE_VCTP16 : MVE_VCTP; +defm MVE_VCTP32 : MVE_VCTP; +defm MVE_VCTP64 : MVE_VCTP; + // end of mve_qDest_rSrc // start of coproc mov @@ -4718,9 +4754,9 @@ multiclass MVE_VLDR_rq_w VTIs> { def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)), (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets))>; def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))), - (VTI.Vec (!cast(NAME#"_u") GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + (VTI.Vec (!cast(NAME#"_u") GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>; def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred 
VCCR:$pred))), - (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>; } } multiclass MVE_VLDR_rq_b VTIs> { @@ -4730,7 +4766,7 @@ multiclass MVE_VLDR_rq_b VTIs> { def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)), (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets))>; def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))), - (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred))>; } } multiclass MVE_VSTR_rq_w VTIs> { @@ -4742,9 +4778,9 @@ multiclass MVE_VSTR_rq_w VTIs> { def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift), (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>; def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)), - (!cast(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + (!cast(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>; def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)), - (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>; } } multiclass MVE_VSTR_rq_b VTIs> { @@ -4754,7 +4790,7 @@ multiclass MVE_VSTR_rq_b VTIs> { def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0), (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>; def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)), - (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, ARMVCCThen, VCCR:$pred)>; } } @@ -4835,7 +4871,7 @@ multiclass MVE_VLDR_qi(NAME) - (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred))>; + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>; } } multiclass MVE_VSTR_qi(NAME) - (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred)>; + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), ARMVCCThen, VCCR:$pred)>; def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))), (AVTI.Vec (!cast(NAME # "_pre") @@ -4859,7 +4895,7 @@ multiclass MVE_VSTR_qi(NAME # "_pre") - (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred))>; + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), ARMVCCThen, VCCR:$pred))>; } } @@ -5116,87 +5152,87 @@ def : MVEInstAlias<"vpsel${vp}." 
# suffix # "\t$Qd, $Qn, $Qm", let Predicates = [HasMVEInt] in { def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), - (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), - (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), - (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), - (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), - (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, VCCR:$pred))>; + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i8 (vselect (v16i8 MQPR:$pred), (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), - (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), 1)))>; + (v16i8 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi8 (v16i8 MQPR:$pred), (MVE_VMOVimmi8 0), ARMCCne)))>; def : Pat<(v8i16 (vselect (v8i16 MQPR:$pred), (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), - (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + (v8i16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>; def : Pat<(v4i32 (vselect (v4i32 MQPR:$pred), (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), - (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + (v4i32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>; def : Pat<(v8f16 (vselect (v8i16 MQPR:$pred), (v8f16 MQPR:$v1), (v8f16 MQPR:$v2))), - (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), 1)))>; + (v8f16 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi16 (v8i16 MQPR:$pred), (MVE_VMOVimmi16 0), ARMCCne)))>; def : Pat<(v4f32 (vselect (v4i32 MQPR:$pred), (v4f32 MQPR:$v1), (v4f32 MQPR:$v2))), - (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, 0, - (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), 1)))>; + (v4f32 (MVE_VPSEL MQPR:$v1, MQPR:$v2, ARMVCCNone, + (MVE_VCMPi32 (v4i32 MQPR:$pred), (MVE_VMOVimmi32 0), ARMCCne)))>; // Pred <-> Int def : Pat<(v16i8 (zext (v16i1 VCCR:$pred))), - (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (zext (v8i1 VCCR:$pred))), - (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (zext (v4i1 VCCR:$pred))), - (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i8 (sext (v16i1 VCCR:$pred))), - (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (sext (v8i1 VCCR:$pred))), - (v8i16 
(MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (sext (v4i1 VCCR:$pred))), - (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL (MVE_VMOVimmi8 255), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i8 (anyext (v16i1 VCCR:$pred))), - (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), 0, VCCR:$pred))>; + (v16i8 (MVE_VPSEL (MVE_VMOVimmi8 1), (MVE_VMOVimmi8 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v8i16 (anyext (v8i1 VCCR:$pred))), - (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), 0, VCCR:$pred))>; + (v8i16 (MVE_VPSEL (MVE_VMOVimmi16 1), (MVE_VMOVimmi16 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i32 (anyext (v4i1 VCCR:$pred))), - (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), 0, VCCR:$pred))>; + (v4i32 (MVE_VPSEL (MVE_VMOVimmi32 1), (MVE_VMOVimmi32 0), ARMVCCNone, VCCR:$pred))>; def : Pat<(v16i1 (trunc (v16i8 MQPR:$v1))), - (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, 1))>; + (v16i1 (MVE_VCMPi32r (v16i8 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v8i1 (trunc (v8i16 MQPR:$v1))), - (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, 1))>; + (v8i1 (MVE_VCMPi32r (v8i16 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v4i1 (trunc (v4i32 MQPR:$v1))), - (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, 1))>; + (v4i1 (MVE_VCMPi32r (v4i32 MQPR:$v1), ZR, ARMCCne))>; } let Predicates = [HasMVEFloat] in { // Pred <-> Float // 112 is 1.0 in float def : Pat<(v4f32 (uint_to_fp (v4i1 VCCR:$pred))), - (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 112)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>; // 2620 in 1.0 in half def : Pat<(v8f16 (uint_to_fp (v8i1 VCCR:$pred))), - (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2620)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>; // 240 is -1.0 in float def : Pat<(v4f32 (sint_to_fp (v4i1 VCCR:$pred))), - (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), 0, VCCR:$pred))>; + (v4f32 (MVE_VPSEL (v4f32 (MVE_VMOVimmf32 240)), (v4f32 (MVE_VMOVimmi32 0)), ARMVCCNone, VCCR:$pred))>; // 2748 is -1.0 in half def : Pat<(v8f16 (sint_to_fp (v8i1 VCCR:$pred))), - (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), 0, VCCR:$pred))>; + (v8f16 (MVE_VPSEL (v8f16 (MVE_VMOVimmi16 2748)), (v8f16 (MVE_VMOVimmi16 0)), ARMVCCNone, VCCR:$pred))>; def : Pat<(v4i1 (fp_to_uint (v4f32 MQPR:$v1))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v8i1 (fp_to_uint (v8f16 MQPR:$v1))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v4i1 (fp_to_sint (v4f32 MQPR:$v1))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, 1))>; + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, ARMCCne))>; def : Pat<(v8i1 (fp_to_sint (v8f16 MQPR:$v1))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, 1))>; + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, ARMCCne))>; } def MVE_VPNOT : MVE_p<(outs VCCR:$P0), (ins VCCR:$P0_in), NoItinerary, @@ -5288,61 +5324,7 @@ def MVE_LCTP : MVE_loltp_end<(outs), (ins pred:$p), "lctp${p}", ""> { // Patterns //===----------------------------------------------------------------------===// -class MVE_vector_store_typed - : Pat<(StoreKind (Ty MQPR:$val), 
t2addrmode_imm7:$addr), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; -class MVE_vector_maskedstore_typed - : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr, VCCR:$pred), - (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr, (i32 1), VCCR:$pred)>; - -multiclass MVE_vector_store { - def : MVE_vector_store_typed; - def : MVE_vector_store_typed; - def : MVE_vector_store_typed; - def : MVE_vector_store_typed; - def : MVE_vector_store_typed; - def : MVE_vector_store_typed; - def : MVE_vector_store_typed; -} - -class MVE_vector_load_typed - : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), - (Ty (RegImmInst t2addrmode_imm7:$addr))>; -class MVE_vector_maskedload_typed - : Pat<(Ty (LoadKind t2addrmode_imm7:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), - (Ty (RegImmInst t2addrmode_imm7:$addr, (i32 1), VCCR:$pred))>; - -multiclass MVE_vector_load { - def : MVE_vector_load_typed; - def : MVE_vector_load_typed; - def : MVE_vector_load_typed; - def : MVE_vector_load_typed; - def : MVE_vector_load_typed; - def : MVE_vector_load_typed; - def : MVE_vector_load_typed; -} - -class MVE_vector_offset_store_typed - : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), - (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; - -multiclass MVE_vector_offset_store { - def : MVE_vector_offset_store_typed; - def : MVE_vector_offset_store_typed; - def : MVE_vector_offset_store_typed; - def : MVE_vector_offset_store_typed; - def : MVE_vector_offset_store_typed; - def : MVE_vector_offset_store_typed; - def : MVE_vector_offset_store_typed; -} +// PatFrags for loads and stores. Often trying to keep semi-consistent names. def aligned32_pre_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), (pre_store node:$val, node:$ptr, node:$offset), [{ @@ -5362,77 +5344,249 @@ def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset), }]>; -def maskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ +def aligned_maskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast(N); return Ld->getMemoryVT().getScalarType() == MVT::i8; }]>; -def sextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_sextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def zextmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_zextmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::ZEXTLOAD; }]>; -def extmaskedload8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (maskedload8 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_extmaskedloadvi8 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi8 node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; -def alignedmaskedload16: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ +def aligned_maskedloadvi16: PatFrag<(ops node:$ptr, node:$pred, 
node:$passthru), + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && Ld->getAlignment() >= 2; }]>; -def sextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_sextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::SEXTLOAD; }]>; -def zextmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_zextmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{ return cast(N)->getExtensionType() == ISD::ZEXTLOAD; }]>; -def extmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (alignedmaskedload16 node:$ptr, node:$pred, node:$passthru), [{ +def aligned_extmaskedloadvi16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (aligned_maskedloadvi16 node:$ptr, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return ScalarVT.isInteger() && Ld->getExtensionType() == ISD::EXTLOAD; }]>; -def alignedmaskedload32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), - (masked_ld node:$ptr, node:$pred, node:$passthru), [{ +def aligned_maskedloadvi32: PatFrag<(ops node:$ptr, node:$pred, node:$passthru), + (masked_ld node:$ptr, undef, node:$pred, node:$passthru), [{ auto *Ld = cast(N); EVT ScalarVT = Ld->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && Ld->getAlignment() >= 4; }]>; -def maskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ +def aligned_maskedstvi8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ return cast(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; -def truncatingmaskedstore8 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (maskedstore8 node:$val, node:$ptr, node:$pred), [{ - return cast(N)->isTruncatingStore(); +def aligned_maskedstvi16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; }]>; -def maskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, node:$ptr, node:$pred), [{ +def aligned_maskedstvi32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; + +def pre_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask), + (masked_st node:$val, node:$base, node:$offset, node:$mask), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return AM == ISD::PRE_INC || AM == ISD::PRE_DEC; +}]>; +def post_maskedstore : PatFrag<(ops node:$val, node:$base, node:$offset, node:$mask), + (masked_st node:$val, node:$base, node:$offset, node:$mask), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return AM == ISD::POST_INC || AM == 
ISD::POST_DEC; +}]>; +def aligned_pre_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_post_maskedstorevi8 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_pre_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def aligned_post_maskedstorevi16 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ auto *St = cast(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; }]>; +def aligned_pre_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (pre_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; +def aligned_post_maskedstorevi32 : PatFrag<(ops node:$val, node:$ptr, node:$offset, node:$mask), + (post_maskedstore node:$val, node:$ptr, node:$offset, node:$mask), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; +}]>; + + +// PatFrags for "Aligned" extending / truncating + +def aligned_extloadvi8 : PatFrag<(ops node:$ptr), (extloadvi8 node:$ptr)>; +def aligned_sextloadvi8 : PatFrag<(ops node:$ptr), (sextloadvi8 node:$ptr)>; +def aligned_zextloadvi8 : PatFrag<(ops node:$ptr), (zextloadvi8 node:$ptr)>; + +def aligned_truncstvi8 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi8 node:$val, node:$ptr)>; +def aligned_post_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncstvi8 node:$val, node:$base, node:$offset)>; +def aligned_pre_truncstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncstvi8 node:$val, node:$base, node:$offset)>; -def truncatingmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (maskedstore16 node:$val, node:$ptr, node:$pred), [{ +let MinAlignment = 2 in { + def aligned_extloadvi16 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; + def aligned_sextloadvi16 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; + def aligned_zextloadvi16 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; + + def aligned_truncstvi16 : PatFrag<(ops node:$val, node:$ptr), + (truncstorevi16 node:$val, node:$ptr)>; + def aligned_post_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (post_truncstvi16 node:$val, node:$base, node:$offset)>; + def aligned_pre_truncstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset), + (pre_truncstvi16 node:$val, node:$base, node:$offset)>; +} + +def truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$pred), + (masked_st node:$val, node:$base, undef, node:$pred), [{ return cast(N)->isTruncatingStore(); }]>; -def maskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred), - (masked_st node:$val, 
node:$ptr, node:$pred), [{ +def aligned_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$pred), + (truncmaskedst node:$val, node:$base, node:$pred), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$pred), + (truncmaskedst node:$val, node:$base, node:$pred), [{ auto *St = cast(N); EVT ScalarVT = St->getMemoryVT().getScalarType(); - return (ScalarVT == MVT::i32 || ScalarVT == MVT::f32) && St->getAlignment() >= 4; + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def pre_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (masked_st node:$val, node:$base, node:$offset, node:$pred), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return cast(N)->isTruncatingStore() && (AM == ISD::PRE_INC || AM == ISD::PRE_DEC); +}]>; +def aligned_pre_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def aligned_pre_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$pred), + (pre_truncmaskedst node:$val, node:$base, node:$offset, node:$pred), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; +def post_truncmaskedst : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (masked_st node:$val, node:$base, node:$offset, node:$postd), [{ + ISD::MemIndexedMode AM = cast(N)->getAddressingMode(); + return cast(N)->isTruncatingStore() && (AM == ISD::POST_INC || AM == ISD::POST_DEC); +}]>; +def aligned_post_truncmaskedstvi8 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{ + return cast(N)->getMemoryVT().getScalarType() == MVT::i8; }]>; +def aligned_post_truncmaskedstvi16 : PatFrag<(ops node:$val, node:$base, node:$offset, node:$postd), + (post_truncmaskedst node:$val, node:$base, node:$offset, node:$postd), [{ + auto *St = cast(N); + EVT ScalarVT = St->getMemoryVT().getScalarType(); + return (ScalarVT == MVT::i16 || ScalarVT == MVT::f16) && St->getAlignment() >= 2; +}]>; + +// Load/store patterns + +class MVE_vector_store_typed + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr)>; + +class MVE_vector_maskedstore_typed + : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7:$addr, VCCR:$pred), + (RegImmInst (Ty MQPR:$val), t2addrmode_imm7:$addr, ARMVCCThen, VCCR:$pred)>; + +multiclass MVE_vector_store { + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; + def : MVE_vector_store_typed; +} + +class MVE_vector_load_typed + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr)), + (Ty (RegImmInst t2addrmode_imm7:$addr))>; + +class MVE_vector_maskedload_typed + : Pat<(Ty (LoadKind t2addrmode_imm7:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))), + (Ty (RegImmInst t2addrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; + +multiclass MVE_vector_load { + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : MVE_vector_load_typed; + def : 
MVE_vector_load_typed; +} + +class MVE_vector_offset_store_typed + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; + +class MVE_vector_offset_maskedstore_typed + : Pat<(StoreKind (Ty MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr, VCCR:$pred), + (Opcode MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr, ARMVCCThen, VCCR:$pred)>; + +multiclass MVE_vector_offset_store { + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; + def : MVE_vector_offset_store_typed; +} + let Predicates = [HasMVEInt, IsLE] in { // Stores @@ -5510,116 +5664,73 @@ let Predicates = [HasMVEInt, IsBE] in { let Predicates = [HasMVEInt] in { // Aligned masked store, shared between LE and BE - def : MVE_vector_maskedstore_typed; - def : MVE_vector_maskedstore_typed; - def : MVE_vector_maskedstore_typed; - def : MVE_vector_maskedstore_typed; - def : MVE_vector_maskedstore_typed; - // Truncating stores - def : Pat<(truncatingmaskedstore8 (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), - (MVE_VSTRB16 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - def : Pat<(truncatingmaskedstore8 (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred), - (MVE_VSTRB32 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>; - def : Pat<(truncatingmaskedstore16 (v4i32 MQPR:$val), t2addrmode_imm7<1>:$addr, VCCR:$pred), - (MVE_VSTRH32 MQPR:$val, t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred)>; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + def : MVE_vector_maskedstore_typed; + + // Pre/Post inc masked stores + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + def : MVE_vector_offset_maskedstore_typed; + // Aligned masked loads - def : MVE_vector_maskedload_typed; - def : MVE_vector_maskedload_typed; - def : MVE_vector_maskedload_typed; - def : MVE_vector_maskedload_typed; - def : MVE_vector_maskedload_typed; - // Extending masked loads. 
- def : Pat<(v8i16 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBS16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (sextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBS32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v8i16 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (zextmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v8i16 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v8i16 NEONimmAllZerosV))), - (v8i16 (MVE_VLDRBU16 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (extmaskedload8 t2addrmode_imm7<0>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRBU32 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (sextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRHS32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (zextmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; - def : Pat<(v4i32 (extmaskedload16 t2addrmode_imm7<1>:$addr, VCCR:$pred, - (v4i32 NEONimmAllZerosV))), - (v4i32 (MVE_VLDRHU32 t2addrmode_imm7<1>:$addr, (i32 1), VCCR:$pred))>; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; + def : MVE_vector_maskedload_typed; } // Widening/Narrowing Loads/Stores -let MinAlignment = 2 in { - def truncstorevi16_align2 : PatFrag<(ops node:$val, node:$ptr), - (truncstorevi16 node:$val, node:$ptr)>; - def post_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), - (post_truncstvi16 node:$val, node:$base, node:$offset)>; - def pre_truncstvi16_align2 : PatFrag<(ops node:$val, node:$base, node:$offset), - (pre_truncstvi16 node:$val, node:$base, node:$offset)>; -} - -let Predicates = [HasMVEInt] in { - def : Pat<(truncstorevi8 (v8i16 MQPR:$val), taddrmode_imm7<0>:$addr), - (MVE_VSTRB16 MQPR:$val, taddrmode_imm7<0>:$addr)>; - def : Pat<(truncstorevi8 (v4i32 MQPR:$val), taddrmode_imm7<0>:$addr), - (MVE_VSTRB32 MQPR:$val, taddrmode_imm7<0>:$addr)>; - def : Pat<(truncstorevi16_align2 (v4i32 MQPR:$val), taddrmode_imm7<1>:$addr), - (MVE_VSTRH32 MQPR:$val, taddrmode_imm7<1>:$addr)>; - - def : Pat<(post_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB16_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(post_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(post_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), - (MVE_VSTRH32_post MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; - - def : Pat<(pre_truncstvi8 (v8i16 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB16_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(pre_truncstvi8 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<0>:$addr), - (MVE_VSTRB32_pre MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset<0>:$addr)>; - def : Pat<(pre_truncstvi16_align2 (v4i32 MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset<1>:$addr), - (MVE_VSTRH32_pre MQPR:$Rt, 
tGPR:$Rn, t2am_imm7_offset<1>:$addr)>; -} - - -let MinAlignment = 2 in { - def extloadvi16_align2 : PatFrag<(ops node:$ptr), (extloadvi16 node:$ptr)>; - def sextloadvi16_align2 : PatFrag<(ops node:$ptr), (sextloadvi16 node:$ptr)>; - def zextloadvi16_align2 : PatFrag<(ops node:$ptr), (zextloadvi16 node:$ptr)>; -} - -multiclass MVEExtLoad { - def _Any : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("extloadvi" # SrcElemBits # Align) am:$addr)), - (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) - am:$addr)>; - def _Z : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("zextloadvi" # SrcElemBits # Align) am:$addr)), - (!cast("MVE_VLDR" # SrcElemType # "U" # DestElemBits) - am:$addr)>; - def _S : Pat<(!cast("v" # DestLanes # "i" # DestElemBits) - (!cast("sextloadvi" # SrcElemBits # Align) am:$addr)), - (!cast("MVE_VLDR" # SrcElemType # "S" # DestElemBits) - am:$addr)>; +multiclass MVEExtLoadStore { + // Trunc stores + def : Pat<(!cast("aligned_truncst"#Amble) (VT MQPR:$val), taddrmode_imm7:$addr), + (!cast(StoreInst) MQPR:$val, taddrmode_imm7:$addr)>; + def : Pat<(!cast("aligned_post_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), + (!cast(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; + def : Pat<(!cast("aligned_pre_truncst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr), + (!cast(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr)>; + + // Masked trunc stores + def : Pat<(!cast("aligned_truncmaskedst"#Amble) (VT MQPR:$val), taddrmode_imm7:$addr, VCCR:$pred), + (!cast(StoreInst) MQPR:$val, taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred)>; + def : Pat<(!cast("aligned_post_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr, VCCR:$pred), + (!cast(StoreInst#"_post") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr, ARMVCCThen, VCCR:$pred)>; + def : Pat<(!cast("aligned_pre_truncmaskedst"#Amble) (VT MQPR:$Rt), tGPR:$Rn, t2am_imm7_offset:$addr, VCCR:$pred), + (!cast(StoreInst#"_pre") MQPR:$Rt, tGPR:$Rn, t2am_imm7_offset:$addr, ARMVCCThen, VCCR:$pred)>; + + // Ext loads + def : Pat<(VT (!cast("aligned_extload"#Amble) taddrmode_imm7:$addr)), + (VT (LoadUInst taddrmode_imm7:$addr))>; + def : Pat<(VT (!cast("aligned_sextload"#Amble) taddrmode_imm7:$addr)), + (VT (LoadSInst taddrmode_imm7:$addr))>; + def : Pat<(VT (!cast("aligned_zextload"#Amble) taddrmode_imm7:$addr)), + (VT (LoadUInst taddrmode_imm7:$addr))>; + + // Masked ext loads + def : Pat<(VT (!cast("aligned_extmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + (VT (LoadUInst taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; + def : Pat<(VT (!cast("aligned_sextmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + (VT (LoadSInst taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; + def : Pat<(VT (!cast("aligned_zextmaskedload"#Amble) taddrmode_imm7:$addr, VCCR:$pred, (VT NEONimmAllZerosV))), + (VT (LoadUInst taddrmode_imm7:$addr, ARMVCCThen, VCCR:$pred))>; } let Predicates = [HasMVEInt] in { - defm : MVEExtLoad<"4", "32", "8", "B", "", taddrmode_imm7<0>>; - defm : MVEExtLoad<"8", "16", "8", "B", "", taddrmode_imm7<0>>; - defm : MVEExtLoad<"4", "32", "16", "H", "_align2", taddrmode_imm7<1>>; + defm : MVEExtLoadStore; + defm : MVEExtLoadStore; + defm : MVEExtLoadStore; } diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td index 94bb45bde5739..6244d8d9e27e5 100644 --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -3314,30 +3314,30 @@ class 
N2VCvtQ op11_8, bit op7, bit op4, // source operand element sizes of 8, 16 and 32 bits: multiclass N2V_QHS_cmp op24_23, bits<2> op21_20, bits<2> op17_16, bits<5> op11_7, bit op4, string opc, string Dt, - string asm, int fc> { + string asm, PatFrag fc> { // 64-bit vector types. def v8i8 : N2V; + [(set DPR:$Vd, (v8i8 (ARMvcmpz (v8i8 DPR:$Vm), fc)))]>; def v4i16 : N2V; + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4i16 DPR:$Vm), fc)))]>; def v2i32 : N2V; + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2i32 DPR:$Vm), fc)))]>; def v2f32 : N2V { + [(set DPR:$Vd, (v2i32 (ARMvcmpz (v2f32 DPR:$Vm), fc)))]> { let Inst{10} = 1; // overwrite F = 1 } def v4f16 : N2V, + [(set DPR:$Vd, (v4i16 (ARMvcmpz (v4f16 DPR:$Vm), fc)))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } @@ -3346,25 +3346,25 @@ multiclass N2V_QHS_cmp op24_23, bits<2> op21_20, bits<2> op17_16, def v16i8 : N2V; + [(set QPR:$Vd, (v16i8 (ARMvcmpz (v16i8 QPR:$Vm), fc)))]>; def v8i16 : N2V; + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8i16 QPR:$Vm), fc)))]>; def v4i32 : N2V; + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4i32 QPR:$Vm), fc)))]>; def v4f32 : N2V { + [(set QPR:$Vd, (v4i32 (ARMvcmpz (v4f32 QPR:$Vm), fc)))]> { let Inst{10} = 1; // overwrite F = 1 } def v8f16 : N2V, + [(set QPR:$Vd, (v8i16 (ARMvcmpz (v8f16 QPR:$Vm), fc)))]>, Requires<[HasNEON,HasFullFP16]> { let Inst{10} = 1; // overwrite F = 1 } @@ -3373,11 +3373,11 @@ multiclass N2V_QHS_cmp op24_23, bits<2> op21_20, bits<2> op17_16, // Neon 3-register comparisons. class N3VQ_cmp op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + ValueType ResTy, ValueType OpTy, PatFrag fc, bit Commutable> : N3V { + [(set QPR:$Vd, (ResTy (ARMvcmp (OpTy QPR:$Vn), (OpTy QPR:$Vm), fc)))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; @@ -3385,11 +3385,11 @@ class N3VQ_cmp op21_20, bits<4> op11_8, bit op4, class N3VD_cmp op21_20, bits<4> op11_8, bit op4, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType ResTy, ValueType OpTy, int fc, bit Commutable> + ValueType ResTy, ValueType OpTy, PatFrag fc, bit Commutable> : N3V { + [(set DPR:$Vd, (ResTy (ARMvcmp (OpTy DPR:$Vn), (OpTy DPR:$Vm), fc)))]> { // All of these have a two-operand InstAlias. let TwoOperandAliasConstraint = "$Vn = $Vd"; let isCommutable = Commutable; @@ -3399,7 +3399,7 @@ multiclass N3V_QHS_cmp op11_8, bit op4, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, - int fc, bit Commutable = 0> { + PatFrag fc, bit Commutable = 0> { // 64-bit vector types. 
def v8i8 : N3VD_cmp; + "vqadd", "s", saddsat, 1>; defm VQADDu : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q, - "vqadd", "u", int_arm_neon_vqaddu, 1>; + "vqadd", "u", uaddsat, 1>; // VADDHN : Vector Add and Narrow Returning High Half (D = Q + Q) defm VADDHN : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>; // VRADDHN : Vector Rounding Add and Narrow Returning High Half (D = Q + Q) @@ -4527,22 +4527,22 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqadds + def : Pat<(v4i16 (saddsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (int_arm_neon_vqadds + def : Pat<(v2i32 (saddsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (int_arm_neon_vqadds + def : Pat<(v8i16 (saddsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))))), (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (int_arm_neon_vqadds + def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))))), @@ -4551,7 +4551,7 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqadds + def : Pat<(v4i16 (saddsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), @@ -4559,7 +4559,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (int_arm_neon_vqadds + def : Pat<(v2i32 (saddsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), @@ -4567,7 +4567,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (int_arm_neon_vqadds + def : Pat<(v8i16 (saddsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), @@ -4579,7 +4579,7 @@ let Predicates = [HasNEON, HasV8_1a] in { QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (int_arm_neon_vqadds + def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), @@ -4597,22 +4597,22 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqsubs + def : Pat<(v4i16 (ssubsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v2i32 (int_arm_neon_vqsubs + def : Pat<(v2i32 (ssubsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; - def : Pat<(v8i16 (int_arm_neon_vqsubs + def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))))), (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; - def : Pat<(v4i32 (int_arm_neon_vqsubs + def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), (v4i32 
QPR:$Vm))))), @@ -4621,14 +4621,14 @@ let Predicates = [HasNEON, HasV8_1a] in { defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", null_frag>; - def : Pat<(v4i16 (int_arm_neon_vqsubs + def : Pat<(v4i16 (ssubsat (v4i16 DPR:$src1), (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; - def : Pat<(v2i32 (int_arm_neon_vqsubs + def : Pat<(v2i32 (ssubsat (v2i32 DPR:$src1), (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), @@ -4636,7 +4636,7 @@ let Predicates = [HasNEON, HasV8_1a] in { imm:$lane)))))), (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane))>; - def : Pat<(v8i16 (int_arm_neon_vqsubs + def : Pat<(v8i16 (ssubsat (v8i16 QPR:$src1), (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src2), @@ -4648,7 +4648,7 @@ let Predicates = [HasNEON, HasV8_1a] in { QPR:$src3, (DSubReg_i16_reg imm:$lane))), (SubReg_i16_lane imm:$lane)))>; - def : Pat<(v4i32 (int_arm_neon_vqsubs + def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src2), @@ -4667,20 +4667,20 @@ defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>; let Predicates = [HasNEON] in { -def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), +def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), +def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1), +def : Pat<(v4i32 (saddsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; -def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1), +def : Pat<(v2i64 (saddsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), @@ -4759,20 +4759,20 @@ defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D, defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", null_frag>; let Predicates = [HasNEON] in { -def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), +def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))))), (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), +def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))))), (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>; -def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1), +def : Pat<(v4i32 (ssubsat (v4i32 QPR:$src1), (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn), (v4i16 (ARMvduplane (v4i16 DPR_8:$Vm), imm:$lane)))))), (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>; -def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1), +def : Pat<(v2i64 (ssubsat (v2i64 QPR:$src1), (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn), (v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm), imm:$lane)))))), @@ -5012,6 +5012,27 @@ defm VCMLA : N3VCP8ComplexTied<1, 0, "vcmla", null_frag>; defm VCADD : N3VCP8ComplexOdd<1, 0, 0, "vcadd", null_frag>; defm VCMLA : N3VCP8ComplexTiedLane<0, 
"vcmla", null_frag>; +let Predicates = [HasNEON,HasV8_3a,HasFullFP16] in { + def : Pat<(v4f16 (int_arm_neon_vcadd_rot90 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))), + (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 0))>; + def : Pat<(v4f16 (int_arm_neon_vcadd_rot270 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm))), + (VCADDv4f16 (v4f16 DPR:$Rn), (v4f16 DPR:$Rm), (i32 1))>; + def : Pat<(v8f16 (int_arm_neon_vcadd_rot90 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))), + (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 0))>; + def : Pat<(v8f16 (int_arm_neon_vcadd_rot270 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm))), + (VCADDv8f16 (v8f16 QPR:$Rn), (v8f16 QPR:$Rm), (i32 1))>; +} +let Predicates = [HasNEON,HasV8_3a] in { + def : Pat<(v2f32 (int_arm_neon_vcadd_rot90 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))), + (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 0))>; + def : Pat<(v2f32 (int_arm_neon_vcadd_rot270 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm))), + (VCADDv2f32 (v2f32 DPR:$Rn), (v2f32 DPR:$Rm), (i32 1))>; + def : Pat<(v4f32 (int_arm_neon_vcadd_rot90 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))), + (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 0))>; + def : Pat<(v4f32 (int_arm_neon_vcadd_rot270 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm))), + (VCADDv4f32 (v4f32 QPR:$Rn), (v4f32 QPR:$Rm), (i32 1))>; +} + // Vector Subtract Operations. // VSUB : Vector Subtract (integer and floating-point) @@ -5045,10 +5066,10 @@ defm VHSUBu : N3VInt_QHS<1, 0, 0b0010, 0, N3RegFrm, // VQSUB : Vector Saturing Subtract defm VQSUBs : N3VInt_QHSD<0, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vqsub", "s", int_arm_neon_vqsubs, 0>; + "vqsub", "s", ssubsat, 0>; defm VQSUBu : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q, - "vqsub", "u", int_arm_neon_vqsubu, 0>; + "vqsub", "u", usubsat, 0>; // VSUBHN : Vector Subtract and Narrow Returning High Half (D = Q - Q) defm VSUBHN : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>; // VRSUBHN : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q) @@ -5068,66 +5089,66 @@ def : Pat<(v2i32 (trunc (ARMvshruImm (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))), // VCEQ : Vector Compare Equal defm VCEQ : N3V_QHS_cmp<1, 0, 0b1000, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vceq", "i", 0, 1>; + IIC_VSUBi4Q, "vceq", "i", ARMCCeq, 1>; def VCEQfd : N3VD_cmp<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32, - 0, 1>; + ARMCCeq, 1>; def VCEQfq : N3VQ_cmp<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32, - 0, 1>; + ARMCCeq, 1>; def VCEQhd : N3VD_cmp<0,0,0b01,0b1110,0, IIC_VBIND, "vceq", "f16", v4i16, v4f16, - 0, 1>, + ARMCCeq, 1>, Requires<[HasNEON, HasFullFP16]>; def VCEQhq : N3VQ_cmp<0,0,0b01,0b1110,0, IIC_VBINQ, "vceq", "f16", v8i16, v8f16, - 0, 1>, + ARMCCeq, 1>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in defm VCEQz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00010, 0, "vceq", "i", - "$Vd, $Vm, #0", 0>; + "$Vd, $Vm, #0", ARMCCeq>; // VCGE : Vector Compare Greater Than or Equal defm VCGEs : N3V_QHS_cmp<0, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "s", 10, 0>; + IIC_VSUBi4Q, "vcge", "s", ARMCCge, 0>; defm VCGEu : N3V_QHS_cmp<1, 0, 0b0011, 1, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcge", "u", 2, 0>; + IIC_VSUBi4Q, "vcge", "u", ARMCChs, 0>; def VCGEfd : N3VD_cmp<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32", v2i32, v2f32, - 10, 0>; + ARMCCge, 0>; def VCGEfq : N3VQ_cmp<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32, - 10, 0>; + ARMCCge, 0>; def VCGEhd : 
N3VD_cmp<1,0,0b01,0b1110,0, IIC_VBIND, "vcge", "f16", v4i16, v4f16, - 10, 0>, + ARMCCge, 0>, Requires<[HasNEON, HasFullFP16]>; def VCGEhq : N3VQ_cmp<1,0,0b01,0b1110,0, IIC_VBINQ, "vcge", "f16", v8i16, v8f16, - 10, 0>, + ARMCCge, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00001, 0, "vcge", "s", - "$Vd, $Vm, #0", 10>; + "$Vd, $Vm, #0", ARMCCge>; defm VCLEz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00011, 0, "vcle", "s", - "$Vd, $Vm, #0", 13>; + "$Vd, $Vm, #0", ARMCCle>; } // VCGT : Vector Compare Greater Than defm VCGTs : N3V_QHS_cmp<0, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "s", 12, 0>; + IIC_VSUBi4Q, "vcgt", "s", ARMCCgt, 0>; defm VCGTu : N3V_QHS_cmp<1, 0, 0b0011, 0, IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, - IIC_VSUBi4Q, "vcgt", "u", 8, 0>; + IIC_VSUBi4Q, "vcgt", "u", ARMCChi, 0>; def VCGTfd : N3VD_cmp<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32, - 12, 0>; + ARMCCgt, 0>; def VCGTfq : N3VQ_cmp<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32, - 12, 0>; + ARMCCgt, 0>; def VCGThd : N3VD_cmp<1,0,0b11,0b1110,0, IIC_VBIND, "vcgt", "f16", v4i16, v4f16, - 12, 0>, + ARMCCgt, 0>, Requires<[HasNEON, HasFullFP16]>; def VCGThq : N3VQ_cmp<1,0,0b11,0b1110,0, IIC_VBINQ, "vcgt", "f16", v8i16, v8f16, - 12, 0>, + ARMCCgt, 0>, Requires<[HasNEON, HasFullFP16]>; let TwoOperandAliasConstraint = "$Vm = $Vd" in { defm VCGTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00000, 0, "vcgt", "s", - "$Vd, $Vm, #0", 12>; + "$Vd, $Vm, #0", ARMCCgt>; defm VCLTz : N2V_QHS_cmp<0b11, 0b11, 0b01, 0b00100, 0, "vclt", "s", - "$Vd, $Vm, #0", 11>; + "$Vd, $Vm, #0", ARMCClt>; } // VACGE : Vector Absolute Compare Greater Than or Equal (aka VCAGE) diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td index fdd961bfbb2f7..90be9a0333ed3 100644 --- a/llvm/lib/Target/ARM/ARMInstrVFP.td +++ b/llvm/lib/Target/ARM/ARMInstrVFP.td @@ -2279,6 +2279,12 @@ def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p), [(set (f32 SPR:$Sd), (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>, RegConstraint<"$Sn = $Sd">, Requires<[HasFPRegs]>; + +def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p), + IIC_fpUNA16, + [(set (f16 HPR:$Sd), + (ARMcmov HPR:$Sn, HPR:$Sm, cmovpred:$p))]>, + RegConstraint<"$Sd = $Sn">, Requires<[HasFPRegs]>; } // hasSideEffects //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 733a3f166069f..756d0fdb55702 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -25,6 +25,8 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/ReachingDefAnalysis.h" #include "llvm/MC/MCInstrDesc.h" using namespace llvm; @@ -104,15 +106,45 @@ namespace { // Is it safe to define LR with DLS/WLS? // LR can be defined if it is the operand to start, because it's the same // value, or if it's going to be equivalent to the operand to Start. 
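// The rewrite below swaps the hand-rolled SearchForDef/SearchForUse scans
// for ReachingDefAnalysis queries (getReachingMIDef, hasSameReachingDef,
// isRegUsedAfter). A toy model of the core query over a single linear block
// is sketched here, assuming a made-up Instr type; it is not the LLVM
// interface, just the idea behind it:
#include <cassert>
#include <optional>
#include <set>
#include <vector>

struct Instr {
  std::set<int> Defs; // registers this instruction writes
};

// Index of the closest instruction before Pos that defines Reg, i.e. the
// reaching definition within one block (nullopt if Reg is live-in).
std::optional<size_t> reachingDef(const std::vector<Instr> &Block,
                                  size_t Pos, int Reg) {
  for (size_t I = Pos; I-- > 0;)
    if (Block[I].Defs.count(Reg))
      return I;
  return std::nullopt;
}

// Two points see the same value iff the same definition reaches both.
bool hasSameReachingDef(const std::vector<Instr> &Block, size_t A, size_t B,
                        int Reg) {
  return reachingDef(Block, A, Reg) == reachingDef(Block, B, Reg);
}

int main() {
  std::vector<Instr> Block = {{{1}}, {{2}}, {{}}, {{}}};
  assert(reachingDef(Block, 3, 1) == std::optional<size_t>(0));
  assert(hasSameReachingDef(Block, 2, 3, 2)); // both see the def at index 1
  return 0;
}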
- MachineInstr *IsSafeToDefineLR(); + MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA); - // Check the branch targets are within range and we satisfy our restructi - void CheckLegality(ARMBasicBlockUtils *BBUtils); + // Check the branch targets are within range and we satisfy our + // restrictions. + void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI); bool FoundAllComponents() const { return Start && Dec && End; } + // Return the loop iteration count, or the number of elements if we're tail + // predicating. + MachineOperand &getCount() { + return IsTailPredicationLegal() ? + VCTP->getOperand(1) : Start->getOperand(0); + } + + unsigned getStartOpcode() const { + bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; + if (!IsTailPredicationLegal()) + return IsDo ? ARM::t2DLS : ARM::t2WLS; + + switch (VCTP->getOpcode()) { + default: + llvm_unreachable("unhandled vctp opcode"); + break; + case ARM::MVE_VCTP8: + return IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; + case ARM::MVE_VCTP16: + return IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; + case ARM::MVE_VCTP32: + return IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; + case ARM::MVE_VCTP64: + return IsDo ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; + } + return 0; + } + void dump() const { if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; @@ -127,6 +159,8 @@ namespace { class ARMLowOverheadLoops : public MachineFunctionPass { MachineFunction *MF = nullptr; + MachineLoopInfo *MLI = nullptr; + ReachingDefAnalysis *RDA = nullptr; const ARMBaseInstrInfo *TII = nullptr; MachineRegisterInfo *MRI = nullptr; std::unique_ptr BBUtils = nullptr; @@ -139,6 +173,7 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -146,7 +181,8 @@ namespace { MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( - MachineFunctionProperties::Property::NoVRegs); + MachineFunctionProperties::Property::NoVRegs).set( + MachineFunctionProperties::Property::TracksLiveness); } StringRef getPassName() const override { @@ -183,31 +219,6 @@ static bool IsLoopStart(MachineInstr &MI) { MI.getOpcode() == ARM::t2WhileLoopStart; } -template -static MachineInstr* SearchForDef(MachineInstr *Begin, T End, unsigned Reg) { - for(auto &MI : make_range(T(Begin), End)) { - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg) - continue; - return &MI; - } - } - return nullptr; -} - -static MachineInstr* SearchForUse(MachineInstr *Begin, - MachineBasicBlock::iterator End, - unsigned Reg) { - for(auto &MI : make_range(MachineBasicBlock::iterator(Begin), End)) { - for (auto &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - return &MI; - } - } - return nullptr; -} - static bool IsVCTP(MachineInstr *MI) { switch (MI->getOpcode()) { default: @@ -221,73 +232,42 @@ static bool IsVCTP(MachineInstr *MI) { return false; } -MachineInstr *LowOverheadLoop::IsSafeToDefineLR() { +MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) { + // We can define LR because LR already contains the same value. 
+ if (Start->getOperand(0).getReg() == ARM::LR) + return Start; - auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { + unsigned CountReg = Start->getOperand(0).getReg(); + auto IsMoveLR = [&CountReg](MachineInstr *MI) { return MI->getOpcode() == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::LR && - MI->getOperand(1).getReg() == Reg && + MI->getOperand(1).getReg() == CountReg && MI->getOperand(2).getImm() == ARMCC::AL; }; MachineBasicBlock *MBB = Start->getParent(); - unsigned CountReg = Start->getOperand(0).getReg(); - // Walk forward and backward in the block to find the closest instructions - // that define LR. Then also filter them out if they're not a mov lr. - MachineInstr *PredLRDef = SearchForDef(Start, MBB->rend(), ARM::LR); - if (PredLRDef && !IsMoveLR(PredLRDef, CountReg)) - PredLRDef = nullptr; - - MachineInstr *SuccLRDef = SearchForDef(Start, MBB->end(), ARM::LR); - if (SuccLRDef && !IsMoveLR(SuccLRDef, CountReg)) - SuccLRDef = nullptr; - - // We've either found one, two or none mov lr instructions... Now figure out - // if they are performing the equilvant mov that the Start instruction will. - // Do this by scanning forward and backward to see if there's a def of the - // register holding the count value. If we find a suitable def, return it as - // the insert point. Later, if InsertPt != Start, then we can remove the - // redundant instruction. - if (SuccLRDef) { - MachineBasicBlock::iterator End(SuccLRDef); - if (!SearchForDef(Start, End, CountReg)) { - return SuccLRDef; - } else - SuccLRDef = nullptr; - } - if (PredLRDef) { - MachineBasicBlock::reverse_iterator End(PredLRDef); - if (!SearchForDef(Start, End, CountReg)) { - return PredLRDef; - } else - PredLRDef = nullptr; - } - // We can define LR because LR already contains the same value. - if (Start->getOperand(0).getReg() == ARM::LR) - return Start; + // Find an insertion point: + // - Is there a (mov lr, Count) before Start? If so, and nothing else writes + // to Count before Start, we can insert at that mov. + // - Is there a (mov lr, Count) after Start? If so, and nothing else writes + // to Count after Start, we can insert at that mov. + if (auto *LRDef = RDA->getReachingMIDef(&MBB->back(), ARM::LR)) { + if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg)) + return LRDef; + } // We've found no suitable LR def and Start doesn't use LR directly. Can we - // just define LR anyway? - const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); - LivePhysRegs LiveRegs(*TRI); - LiveRegs.addLiveOuts(*MBB); - - // Not if we've haven't found a suitable mov and LR is live out. - if (LiveRegs.contains(ARM::LR)) - return nullptr; - - // If LR is not live out, we can insert the instruction if nothing else - // uses LR after it. - if (!SearchForUse(Start, MBB->end(), ARM::LR)) + // just define LR anyway? + if (!RDA->isRegUsedAfter(Start, ARM::LR)) return Start; - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find suitable insertion point for" - << " LR\n"); return nullptr; } -void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) { +void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils, + ReachingDefAnalysis *RDA, + MachineLoopInfo *MLI) { if (Revert) return; @@ -320,18 +300,74 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) { return; } - InsertPt = Revert ? nullptr : IsSafeToDefineLR(); + InsertPt = Revert ? 
nullptr : IsSafeToDefineLR(RDA); if (!InsertPt) { LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); Revert = true; + return; } else LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - LLVM_DEBUG(if (IsTailPredicationLegal()) { - dbgs() << "ARM Loops: Will use tail predication to convert:\n"; + // For tail predication, we need to provide the number of elements, instead + // of the iteration count, to the loop start instruction. The number of + // elements is provided to the vctp instruction, so we need to check that + // we can use this register at InsertPt. + if (!IsTailPredicationLegal()) + return; + + Register NumElements = VCTP->getOperand(1).getReg(); + + // If the register is defined within loop, then we can't perform TP. + // TODO: Check whether this is just a mov of a register that would be + // available. + if (RDA->getReachingDef(VCTP, NumElements) >= 0) { + CannotTailPredicate = true; + return; + } + + // We can't perform TP if the register does not hold the same value at + // InsertPt as the liveout value. + MachineBasicBlock *InsertBB = InsertPt->getParent(); + if (!RDA->hasSameReachingDef(InsertPt, &InsertBB->back(), + NumElements)) { + CannotTailPredicate = true; + return; + } + + // Especially in the case of while loops, InsertBB may not be the + // preheader, so we need to check that the register isn't redefined + // before entering the loop. + auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB, + Register NumElements) { + // NumElements is redefined in this block. + if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0) + return true; + + // Don't continue searching up through multiple predecessors. + if (MBB->pred_size() > 1) + return true; + + return false; + }; + + // First, find the block that looks like the preheader. + MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true); + if (!MBB) { + CannotTailPredicate = true; + return; + } + + // Then search backwards for a def, until we get to InsertBB. 
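// The while loop that follows relies on there being a unique chain of
// predecessors from the preheader-like block down to InsertBB, which is why
// CannotProvideElements gives up as soon as a block has more than one
// predecessor. A toy version of that backwards chain walk, with made-up
// types standing in for MachineBasicBlock:
#include <vector>

struct Block {
  std::vector<Block *> Preds;
  bool ClobbersCount = false; // does this block redefine the element count?
};

// Walk single-predecessor links from From back to To; fail if the count
// register is clobbered on the way or the chain forks.
bool chainIsClean(Block *From, Block *To) {
  while (From != To) {
    if (From->ClobbersCount || From->Preds.size() != 1)
      return false;
    From = From->Preds.front();
  }
  return true;
}

int main() {
  Block A, B, C;
  B.Preds = {&A};
  C.Preds = {&B};
  return chainIsClean(&C, &A) ? 0 : 1; // C -> B -> A, nothing clobbered
}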
+ while (MBB != InsertBB) { + CannotTailPredicate = CannotProvideElements(MBB, NumElements); + if (CannotTailPredicate) + return; + MBB = *MBB->pred_begin(); + } + + LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication to convert:\n"; for (auto *MI : VPTUsers) - dbgs() << " - " << *MI; - }); + dbgs() << " - " << *MI;); } bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { @@ -342,7 +378,8 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { MF = &mf; LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); - auto &MLI = getAnalysis(); + MLI = &getAnalysis(); + RDA = &getAnalysis(); MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); MRI = &MF->getRegInfo(); TII = static_cast(ST.getInstrInfo()); @@ -351,7 +388,7 @@ bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { BBUtils->adjustBBOffsetsAfter(&MF->front()); bool Changed = false; - for (auto ML : MLI) { + for (auto ML : *MLI) { if (!ML->getParentLoop()) Changed |= ProcessLoop(ML); } @@ -367,7 +404,14 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { for (auto I = ML->begin(), E = ML->end(); I != E; ++I) Changed |= ProcessLoop(*I); - LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); + LLVM_DEBUG(dbgs() << "ARM Loops: Processing loop containing:\n"; + if (auto *Preheader = ML->getLoopPreheader()) + dbgs() << " - " << Preheader->getName() << "\n"; + else if (auto *Preheader = MLI->findLoopPreheader(ML)) + dbgs() << " - " << Preheader->getName() << "\n"; + for (auto *MBB : ML->getBlocks()) + dbgs() << " - " << MBB->getName() << "\n"; + ); // Search the given block for a loop start instruction. If one isn't found, // and there's only one predecessor block, search that one too. @@ -383,28 +427,15 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { }; LowOverheadLoop LoLoop(ML); - // Search the preheader for the start intrinsic, or look through the - // predecessors of the header to find exactly one set.iterations intrinsic. + // Search the preheader for the start intrinsic. // FIXME: I don't see why we shouldn't be supporting multiple predecessors // with potentially multiple set.loop.iterations, so we need to enable this. if (auto *Preheader = ML->getLoopPreheader()) LoLoop.Start = SearchForStart(Preheader); - else { - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" - << " - Performing manual predecessor search.\n"); - MachineBasicBlock *Pred = nullptr; - for (auto *MBB : ML->getHeader()->predecessors()) { - if (!ML->contains(MBB)) { - if (Pred) { - LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); - LoLoop.Start = nullptr; - break; - } - Pred = MBB; - LoLoop.Start = SearchForStart(MBB); - } - } - } + else if (auto *Preheader = MLI->findLoopPreheader(ML, true)) + LoLoop.Start = SearchForStart(Preheader); + else + return false; // Find the low-overhead loop components and decide whether or not to fall // back to a normal loop. 
Also look for a vctp instructions and decide @@ -462,7 +493,7 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { if (!LoLoop.FoundAllComponents()) return false; - LoLoop.CheckLegality(BBUtils.get()); + LoLoop.CheckLegality(BBUtils.get(), RDA, MLI); Expand(LoLoop); return true; } @@ -493,19 +524,15 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const { } bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI, - bool AllowFlags) const { + bool SetFlags) const { LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI); MachineBasicBlock *MBB = MI->getParent(); - // If nothing uses or defines CPSR between LoopDec and LoopEnd, use a t2SUBS. - bool SetFlags = false; - if (AllowFlags) { - if (auto *Def = SearchForDef(MI, MBB->end(), ARM::CPSR)) { - if (!SearchForUse(MI, MBB->end(), ARM::CPSR) && - Def->getOpcode() == ARM::t2LoopEnd) - SetFlags = true; - } - } + // If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS. + if (SetFlags && + (RDA->isRegUsedAfter(MI, ARM::CPSR) || + !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR))) + SetFlags = false; MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(ARM::t2SUBri)); @@ -558,35 +585,45 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { MachineInstr *Start = LoLoop.Start; MachineBasicBlock *MBB = InsertPt->getParent(); bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; - unsigned Opc = 0; - - if (!LoLoop.IsTailPredicationLegal()) - Opc = IsDo ? ARM::t2DLS : ARM::t2WLS; - else { - switch (LoLoop.VCTP->getOpcode()) { - case ARM::MVE_VCTP8: - Opc = IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; - break; - case ARM::MVE_VCTP16: - Opc = IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; - break; - case ARM::MVE_VCTP32: - Opc = IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; - break; - case ARM::MVE_VCTP64: - Opc = IsDo ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; - break; - } - } + unsigned Opc = LoLoop.getStartOpcode(); + MachineOperand &Count = LoLoop.getCount(); MachineInstrBuilder MIB = BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); - MIB.add(Start->getOperand(0)); + MIB.add(Count); if (!IsDo) MIB.add(Start->getOperand(1)); + // When using tail-predication, try to delete the dead code that was used to + // calculate the number of loop iterations. + if (LoLoop.IsTailPredicationLegal()) { + SmallVector Killed; + SmallVector Dead; + if (auto *Def = RDA->getReachingMIDef(Start, + Start->getOperand(0).getReg())) { + Killed.push_back(Def); + + while (!Killed.empty()) { + MachineInstr *Def = Killed.back(); + Killed.pop_back(); + Dead.push_back(Def); + for (auto &MO : Def->operands()) { + if (!MO.isReg() || !MO.isKill()) + continue; + + MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg()); + if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1) + Killed.push_back(Kill); + } + } + for (auto *MI : Dead) + MI->eraseFromParent(); + } + } + + // If we're inserting at a mov lr, then remove it as it's redundant. 
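// The clean-up just above walks back through kill flags, collecting the
// chain of single-use instructions that only existed to compute the scalar
// iteration count, so they can be erased once the tail-predicated loop
// consumes the element count instead. The same worklist idea on a made-up
// toy IR (Node is illustrative, not a MachineInstr):
#include <vector>

struct Node {
  std::vector<Node *> KilledInputs; // defs whose only use is this node
  bool Dead = false;
};

// Mark Root and every single-use def feeding it as dead, depth-first.
void markDeadChain(Node *Root) {
  std::vector<Node *> Worklist{Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    N->Dead = true;
    for (Node *In : N->KilledInputs)
      if (!In->Dead)
        Worklist.push_back(In);
  }
}

int main() {
  Node Mul, Add; // Add is the count computation's final step
  Add.KilledInputs = {&Mul};
  markDeadChain(&Add);
  return (Add.Dead && Mul.Dead) ? 0 : 1;
}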
if (InsertPt != Start) InsertPt->eraseFromParent(); Start->eraseFromParent(); diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index 7a57376a68953..eb4d39b01cbbf 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -98,9 +98,8 @@ ARMSubtarget::ARMSubtarget(const Triple &TT, const std::string &CPU, const ARMBaseTargetMachine &TM, bool IsLittle, bool MinSize) : ARMGenSubtargetInfo(TT, CPU, FS), UseMulOps(UseFusedMulOps), - ReservedGPRegisters(ARM::GPRRegClass.getNumRegs()), CPUString(CPU), - OptMinSize(MinSize), IsLittle(IsLittle), TargetTriple(TT), - Options(TM.Options), TM(TM), + CPUString(CPU), OptMinSize(MinSize), IsLittle(IsLittle), + TargetTriple(TT), Options(TM.Options), TM(TM), FrameLowering(initializeFrameLowering(CPU, FS)), // At this point initializeSubtargetDependencies has been called so // we can query directly. @@ -254,18 +253,8 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { (Options.UnsafeFPMath || isTargetDarwin())) UseNEONForSinglePrecisionFP = true; - if (isRWPI() || (isTargetMachO() && !HasV6Ops)) - ReservedGPRegisters.set(9); - - // Throw an error when trying to reserve a target's FP register. It may - // be used by the compiler even when frame pointer elimination is enabled. - // FIXME: Throw this error if -frame-pointer=none is not set; otherwise - // only emit a warning. - const int restFP = (useR7AsFramePointer()) ? 7 : 11; - if (isGPRegisterReserved(restFP)) - report_fatal_error( - "Register r" + std::to_string(restFP) + - " has been specified but is used as the frame pointer for this target."); + if (isRWPI()) + ReserveR9 = true; // If MVEVectorCostFactor is still 0 (has not been set to anything else), default it to 2 if (MVEVectorCostFactor == 0) diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index c5836a3eca7b7..f582a92f65639 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -229,8 +229,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo { /// NoARM - True if subtarget does not support ARM mode execution. bool NoARM = false; - // ReservedGPRegisters[i] - R#i is not available as a general purpose register - BitVector ReservedGPRegisters; + /// ReserveR9 - True if R9 is not available as a general purpose register. + bool ReserveR9 = false; /// NoMovt - True if MOVT / MOVW pairs are not used for materialization of /// 32-bit imms (including global addresses). @@ -763,9 +763,8 @@ class ARMSubtarget : public ARMGenSubtargetInfo { bool isAClass() const { return ARMProcClass == AClass; } bool isReadTPHard() const { return ReadTPHard; } - bool isGPRegisterReserved(size_t i) const { return ReservedGPRegisters[i]; } - unsigned getNumGPRegistersReserved() const { - return ReservedGPRegisters.count(); + bool isR9Reserved() const { + return isTargetMachO() ? 
(ReserveR9 || !HasV6Ops) : ReserveR9; } bool useR7AsFramePointer() const { diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 10f68542e7e1e..018ce3903c2d7 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -91,7 +91,6 @@ extern "C" void LLVMInitializeARMTarget() { initializeARMLoadStoreOptPass(Registry); initializeARMPreAllocLoadStoreOptPass(Registry); initializeARMParallelDSPPass(Registry); - initializeARMCodeGenPreparePass(Registry); initializeARMConstantIslandsPass(Registry); initializeARMExecutionDomainFixPass(Registry); initializeARMExpandPseudoPass(Registry); @@ -422,7 +421,7 @@ void ARMPassConfig::addIRPasses() { void ARMPassConfig::addCodeGenPrepare() { if (getOptLevel() != CodeGenOpt::None) - addPass(createARMCodeGenPreparePass()); + addPass(createTypePromotionPass()); TargetPassConfig::addCodeGenPrepare(); } diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index c1fd01d2df9d5..5bb3bcaf10e77 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -76,9 +76,7 @@ class ARMTTIImpl : public BasicTTIImplBase { ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization, ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass, ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, - ARM::FeatureExecuteOnly, ARM::FeatureReserveR6, ARM::FeatureReserveR7, - ARM::FeatureReserveR8, ARM::FeatureReserveR9, ARM::FeatureReserveR10, - ARM::FeatureReserveR11, ARM::FeatureNoMovt, + ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, ARM::FeatureNoNegativeImmediates }; diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt index 5ce28f29defbc..b94a78ea9404f 100644 --- a/llvm/lib/Target/ARM/CMakeLists.txt +++ b/llvm/lib/Target/ARM/CMakeLists.txt @@ -25,7 +25,6 @@ add_llvm_target(ARMCodeGen ARMBasicBlockInfo.cpp ARMCallingConv.cpp ARMCallLowering.cpp - ARMCodeGenPrepare.cpp ARMConstantIslandPass.cpp ARMConstantPoolValue.cpp ARMExpandPseudoInsts.cpp diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 397f900447700..e8bc43dbe2ddb 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -485,10 +485,15 @@ bool MVETailPredication::TryConvert(Value *TripCount) { switch (VecTy->getNumElements()) { default: llvm_unreachable("unexpected number of lanes"); - case 2: VCTPID = Intrinsic::arm_vctp64; break; - case 4: VCTPID = Intrinsic::arm_vctp32; break; - case 8: VCTPID = Intrinsic::arm_vctp16; break; - case 16: VCTPID = Intrinsic::arm_vctp8; break; + case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; + case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; + case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; + + // FIXME: vctp64 currently not supported because the predicate + // vector wants to be <2 x i1>, but v2i1 is not a legal MVE + // type, so problems happen at isel time. + // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics + // purposes, but takes a v4i1 instead of a v2i1. 
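// For context on the vctp mapping above: arm_mve_vctp<width> produces a
// lane predicate that is true for the first `elements` lanes and false for
// the rest, which is what lets the final, partial loop iteration run under
// predication. A scalar model of the 32-bit variant, as a sketch only:
#include <array>
#include <cassert>
#include <cstdint>

// vctp32 over a 128-bit vector: 4 lanes of 32-bit elements. Counts of 4 or
// more enable every lane.
std::array<bool, 4> vctp32(uint32_t Elements) {
  std::array<bool, 4> Mask{};
  for (uint32_t Lane = 0; Lane < 4; ++Lane)
    Mask[Lane] = Lane < Elements;
  return Mask;
}

int main() {
  auto M = vctp32(3); // last iteration with 3 elements remaining
  assert(M[0] && M[1] && M[2] && !M[3]);
  return 0;
}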
} Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); Value *TailPredicate = Builder.CreateCall(VCTP, Processed); diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h index aa3aca359cb8d..11cb1a162e2ba 100644 --- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h +++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h @@ -64,6 +64,25 @@ inline static CondCodes getOppositeCondition(CondCodes CC) { case LE: return GT; } } + +/// getSwappedCondition - assume the flags are set by MI(a,b), return +/// the condition code if we modify the instructions such that flags are +/// set by MI(b,a). +inline static ARMCC::CondCodes getSwappedCondition(ARMCC::CondCodes CC) { + switch (CC) { + default: return ARMCC::AL; + case ARMCC::EQ: return ARMCC::EQ; + case ARMCC::NE: return ARMCC::NE; + case ARMCC::HS: return ARMCC::LS; + case ARMCC::LO: return ARMCC::HI; + case ARMCC::HI: return ARMCC::LO; + case ARMCC::LS: return ARMCC::HS; + case ARMCC::GE: return ARMCC::LE; + case ARMCC::LT: return ARMCC::GT; + case ARMCC::GT: return ARMCC::LT; + case ARMCC::LE: return ARMCC::GE; + } +} } // end namespace ARMCC namespace ARMVCC { diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 3af29a2e698b3..a28816cc87b7d 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -829,9 +829,13 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, RecordAlignment); } - // Access key is the type name + reloc type + patched imm + access string, + // Access key is the + // "llvm." + type name + ":" + reloc type + ":" + patched imm + "$" + + // access string, // uniquely identifying one relocation. - AccessKey = TypeName + ":" + std::to_string(InfoKind) + ":" + + // The prefix "llvm." indicates this is a temporary global, which should + // not be emitted to ELF file. + AccessKey = "llvm." + TypeName + ":" + std::to_string(InfoKind) + ":" + std::to_string(PatchImm) + "$" + AccessKey; return Base; diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 7f52812179534..639ee2df96a9d 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -252,8 +252,10 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - void expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI, bool IsLoad); + void expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI, bool IsLoad); + void expandMem9Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI, bool IsLoad); bool expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); @@ -1824,11 +1826,14 @@ static bool needsExpandMemInst(MCInst &Inst) { const MCOperandInfo &OpInfo = MCID.OpInfo[NumOp - 1]; if (OpInfo.OperandType != MCOI::OPERAND_MEMORY && - OpInfo.OperandType != MCOI::OPERAND_UNKNOWN) + OpInfo.OperandType != MCOI::OPERAND_UNKNOWN && + OpInfo.OperandType != MipsII::OPERAND_MEM_SIMM9) return false; MCOperand &Op = Inst.getOperand(NumOp - 1); if (Op.isImm()) { + if (OpInfo.OperandType == MipsII::OPERAND_MEM_SIMM9) + return !isInt<9>(Op.getImm()); // Offset can't exceed 16bit value. 
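// The simm9/simm16 checks above decide whether a memory offset fits the
// encoding directly; when it does not, the expansion materialises the high
// part with lui (plus an add of the base) and folds a sign-extended low
// part into the load/store itself. The arithmetic of that split, shown in
// isolation as a sketch (not the MipsAsmParser code path):
#include <cassert>
#include <cstdint>

struct HiLo { int32_t Hi; int16_t Lo; };

// Split Off so that (Hi << 16) + sign_extend(Lo) == Off. Subtracting the
// sign-extended Lo first compensates for the hardware sign-extending the
// 16-bit offset when it adds it (cf. the usual (Off + 0x8000) >> 16 form).
HiLo splitOffset(int32_t Off) {
  int16_t Lo = static_cast<int16_t>(Off & 0xffff);
  int32_t Hi = static_cast<int32_t>((static_cast<int64_t>(Off) - Lo) >> 16);
  return {Hi, Lo};
}

int main() {
  for (int32_t Off : {0x12345678, -4, 32768}) {
    HiLo S = splitOffset(Off);
    assert((static_cast<int64_t>(S.Hi) << 16) + S.Lo == Off);
  }
  return 0;
}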
return !isInt<16>(Op.getImm()); } @@ -2133,7 +2138,15 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, // Check the offset of memory operand, if it is a symbol // reference or immediate we may have to expand instructions. if (needsExpandMemInst(Inst)) { - expandMemInst(Inst, IDLoc, Out, STI, MCID.mayLoad()); + const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode()); + switch (MCID.OpInfo[MCID.getNumOperands() - 1].OperandType) { + case MipsII::OPERAND_MEM_SIMM9: + expandMem9Inst(Inst, IDLoc, Out, STI, MCID.mayLoad()); + break; + default: + expandMem16Inst(Inst, IDLoc, Out, STI, MCID.mayLoad()); + break; + } return getParser().hasPendingError(); } } @@ -3631,20 +3644,26 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } -void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI, bool IsLoad) { - const MCOperand &DstRegOp = Inst.getOperand(0); +void MipsAsmParser::expandMem16Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI, bool IsLoad) { + unsigned NumOp = Inst.getNumOperands(); + assert((NumOp == 3 || NumOp == 4) && "unexpected operands number"); + unsigned StartOp = NumOp == 3 ? 0 : 1; + + const MCOperand &DstRegOp = Inst.getOperand(StartOp); assert(DstRegOp.isReg() && "expected register operand kind"); - const MCOperand &BaseRegOp = Inst.getOperand(1); + const MCOperand &BaseRegOp = Inst.getOperand(StartOp + 1); assert(BaseRegOp.isReg() && "expected register operand kind"); + const MCOperand &OffsetOp = Inst.getOperand(StartOp + 2); MipsTargetStreamer &TOut = getTargetStreamer(); + unsigned OpCode = Inst.getOpcode(); unsigned DstReg = DstRegOp.getReg(); unsigned BaseReg = BaseRegOp.getReg(); unsigned TmpReg = DstReg; - const MCInstrDesc &Desc = getInstDesc(Inst.getOpcode()); - int16_t DstRegClass = Desc.OpInfo[0].RegClass; + const MCInstrDesc &Desc = getInstDesc(OpCode); + int16_t DstRegClass = Desc.OpInfo[StartOp].RegClass; unsigned DstRegClassID = getContext().getRegisterInfo()->getRegClass(DstRegClass).getID(); bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) || @@ -3658,25 +3677,12 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return; } - if (Inst.getNumOperands() > 3) { - const MCOperand &BaseRegOp = Inst.getOperand(2); - assert(BaseRegOp.isReg() && "expected register operand kind"); - const MCOperand &ExprOp = Inst.getOperand(3); - assert(ExprOp.isExpr() && "expected expression oprand kind"); - - unsigned BaseReg = BaseRegOp.getReg(); - const MCExpr *ExprOffset = ExprOp.getExpr(); - - MCOperand LoOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); - MCOperand HiOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); - TOut.emitSCWithSymOffset(Inst.getOpcode(), DstReg, BaseReg, HiOperand, - LoOperand, TmpReg, IDLoc, STI); - return; - } - - const MCOperand &OffsetOp = Inst.getOperand(2); + auto emitInstWithOffset = [&](const MCOperand &Off) { + if (NumOp == 3) + TOut.emitRRX(OpCode, DstReg, TmpReg, Off, IDLoc, STI); + else + TOut.emitRRRX(OpCode, DstReg, DstReg, TmpReg, Off, IDLoc, STI); + }; if (OffsetOp.isImm()) { int64_t LoOffset = OffsetOp.getImm() & 0xffff; @@ -3690,16 +3696,16 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, bool IsLargeOffset = HiOffset != 0; if (IsLargeOffset) { - bool Is32BitImm = (HiOffset >> 32) == 0; + bool Is32BitImm = 
isInt<32>(OffsetOp.getImm()); if (loadImmediate(HiOffset, TmpReg, Mips::NoRegister, Is32BitImm, true, IDLoc, Out, STI)) return; } if (BaseReg != Mips::ZERO && BaseReg != Mips::ZERO_64) - TOut.emitRRR(isGP64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, TmpReg, - BaseReg, IDLoc, STI); - TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, LoOffset, IDLoc, STI); + TOut.emitRRR(ABI.ArePtrs64bit() ? Mips::DADDu : Mips::ADDu, TmpReg, + TmpReg, BaseReg, IDLoc, STI); + emitInstWithOffset(MCOperand::createImm(int16_t(LoOffset))); return; } @@ -3723,26 +3729,41 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, loadAndAddSymbolAddress(Res.getSymA(), TmpReg, BaseReg, !ABI.ArePtrs64bit(), IDLoc, Out, STI); - TOut.emitRRI(Inst.getOpcode(), DstReg, TmpReg, Res.getConstant(), IDLoc, - STI); + emitInstWithOffset(MCOperand::createImm(int16_t(Res.getConstant()))); } else { // FIXME: Implement 64-bit case. // 1) lw $8, sym => lui $8, %hi(sym) // lw $8, %lo(sym)($8) // 2) sw $8, sym => lui $at, %hi(sym) // sw $8, %lo(sym)($at) - const MCExpr *ExprOffset = OffsetOp.getExpr(); + const MCExpr *OffExpr = OffsetOp.getExpr(); MCOperand LoOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_LO, ExprOffset, getContext())); + MipsMCExpr::create(MipsMCExpr::MEK_LO, OffExpr, getContext())); MCOperand HiOperand = MCOperand::createExpr( - MipsMCExpr::create(MipsMCExpr::MEK_HI, ExprOffset, getContext())); - - // Generate the base address in TmpReg. - TOut.emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI); - if (BaseReg != Mips::ZERO) - TOut.emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); - // Emit the load or store with the adjusted base and offset. - TOut.emitRRX(Inst.getOpcode(), DstReg, TmpReg, LoOperand, IDLoc, STI); + MipsMCExpr::create(MipsMCExpr::MEK_HI, OffExpr, getContext())); + + if (ABI.IsN64()) { + MCOperand HighestOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, OffExpr, getContext())); + MCOperand HigherOperand = MCOperand::createExpr( + MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, OffExpr, getContext())); + + TOut.emitRX(Mips::LUi, TmpReg, HighestOperand, IDLoc, STI); + TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HigherOperand, IDLoc, STI); + TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI); + TOut.emitRRX(Mips::DADDiu, TmpReg, TmpReg, HiOperand, IDLoc, STI); + TOut.emitRRI(Mips::DSLL, TmpReg, TmpReg, 16, IDLoc, STI); + if (BaseReg != Mips::ZERO && BaseReg != Mips::ZERO_64) + TOut.emitRRR(Mips::DADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); + emitInstWithOffset(LoOperand); + } else { + // Generate the base address in TmpReg. + TOut.emitRX(Mips::LUi, TmpReg, HiOperand, IDLoc, STI); + if (BaseReg != Mips::ZERO) + TOut.emitRRR(Mips::ADDu, TmpReg, TmpReg, BaseReg, IDLoc, STI); + // Emit the load or store with the adjusted base and offset. + emitInstWithOffset(LoOperand); + } } return; } @@ -3750,6 +3771,64 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, llvm_unreachable("unexpected operand type"); } +void MipsAsmParser::expandMem9Inst(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI, bool IsLoad) { + unsigned NumOp = Inst.getNumOperands(); + assert((NumOp == 3 || NumOp == 4) && "unexpected operands number"); + unsigned StartOp = NumOp == 3 ? 
0 : 1; + + const MCOperand &DstRegOp = Inst.getOperand(StartOp); + assert(DstRegOp.isReg() && "expected register operand kind"); + const MCOperand &BaseRegOp = Inst.getOperand(StartOp + 1); + assert(BaseRegOp.isReg() && "expected register operand kind"); + const MCOperand &OffsetOp = Inst.getOperand(StartOp + 2); + + MipsTargetStreamer &TOut = getTargetStreamer(); + unsigned OpCode = Inst.getOpcode(); + unsigned DstReg = DstRegOp.getReg(); + unsigned BaseReg = BaseRegOp.getReg(); + unsigned TmpReg = DstReg; + + const MCInstrDesc &Desc = getInstDesc(OpCode); + int16_t DstRegClass = Desc.OpInfo[StartOp].RegClass; + unsigned DstRegClassID = + getContext().getRegisterInfo()->getRegClass(DstRegClass).getID(); + bool IsGPR = (DstRegClassID == Mips::GPR32RegClassID) || + (DstRegClassID == Mips::GPR64RegClassID); + + if (!IsLoad || !IsGPR || (BaseReg == DstReg)) { + // At this point we need AT to perform the expansions + // and we exit if it is not available. + TmpReg = getATReg(IDLoc); + if (!TmpReg) + return; + } + + auto emitInst = [&]() { + if (NumOp == 3) + TOut.emitRRX(OpCode, DstReg, TmpReg, MCOperand::createImm(0), IDLoc, STI); + else + TOut.emitRRRX(OpCode, DstReg, DstReg, TmpReg, MCOperand::createImm(0), + IDLoc, STI); + }; + + if (OffsetOp.isImm()) { + loadImmediate(OffsetOp.getImm(), TmpReg, BaseReg, !ABI.ArePtrs64bit(), true, + IDLoc, Out, STI); + emitInst(); + return; + } + + if (OffsetOp.isExpr()) { + loadAndAddSymbolAddress(OffsetOp.getExpr(), TmpReg, BaseReg, + !ABI.ArePtrs64bit(), IDLoc, Out, STI); + emitInst(); + return; + } + + llvm_unreachable("unexpected operand type"); +} + bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h index 3c11edfc3fc78..02ab5ede2c1a4 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h @@ -16,6 +16,7 @@ #include "MipsFixupKinds.h" #include "MipsMCTargetDesc.h" #include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrDesc.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" @@ -127,6 +128,12 @@ namespace MipsII { HasFCCRegOperand = 1 << 6 }; + + enum OperandType : unsigned { + OPERAND_FIRST_MIPS_MEM_IMM = MCOI::OPERAND_FIRST_TARGET, + OPERAND_MEM_SIMM9 = OPERAND_FIRST_MIPS_MEM_IMM, + OPERAND_LAST_MIPS_MEM_IMM = OPERAND_MEM_SIMM9 + }; } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index d84e4eada6466..d0b3c204730fb 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -12,6 +12,7 @@ #include "MipsMCTargetDesc.h" #include "MipsAsmBackend.h" +#include "MipsBaseInfo.h" #include "MipsELFStreamer.h" #include "MipsInstPrinter.h" #include "MipsMCAsmInfo.h" diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp index 3ff9c722484bf..bdfb70aa98131 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp @@ -74,27 +74,23 @@ void MipsRegInfoRecord::SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo) { unsigned Value = 0; - for (MCSubRegIterator SubRegIt(Reg, MCRegInfo, true); SubRegIt.isValid(); - ++SubRegIt) { - unsigned CurrentSubReg = *SubRegIt; - - unsigned EncVal = 
MCRegInfo->getEncodingValue(CurrentSubReg); + for (const MCPhysReg &SubReg : MCRegInfo->subregs_inclusive(Reg)) { + unsigned EncVal = MCRegInfo->getEncodingValue(SubReg); Value |= 1 << EncVal; - if (GPR32RegClass->contains(CurrentSubReg) || - GPR64RegClass->contains(CurrentSubReg)) + if (GPR32RegClass->contains(SubReg) || GPR64RegClass->contains(SubReg)) ri_gprmask |= Value; - else if (COP0RegClass->contains(CurrentSubReg)) + else if (COP0RegClass->contains(SubReg)) ri_cprmask[0] |= Value; // MIPS COP1 is the FPU. - else if (FGR32RegClass->contains(CurrentSubReg) || - FGR64RegClass->contains(CurrentSubReg) || - AFGR64RegClass->contains(CurrentSubReg) || - MSA128BRegClass->contains(CurrentSubReg)) + else if (FGR32RegClass->contains(SubReg) || + FGR64RegClass->contains(SubReg) || + AFGR64RegClass->contains(SubReg) || + MSA128BRegClass->contains(SubReg)) ri_cprmask[1] |= Value; - else if (COP2RegClass->contains(CurrentSubReg)) + else if (COP2RegClass->contains(SubReg)) ri_cprmask[2] |= Value; - else if (COP3RegClass->contains(CurrentSubReg)) + else if (COP3RegClass->contains(SubReg)) ri_cprmask[3] |= Value; } } diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index b6dae9f6dea82..054dc79f4aa91 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -34,11 +34,6 @@ static cl::opt RoundSectionSizes( cl::desc("Round section sizes up to the section alignment"), cl::Hidden); } // end anonymous namespace -static bool isMipsR6(const MCSubtargetInfo *STI) { - return STI->getFeatureBits()[Mips::FeatureMips32r6] || - STI->getFeatureBits()[Mips::FeatureMips64r6]; -} - static bool isMicroMips(const MCSubtargetInfo *STI) { return STI->getFeatureBits()[Mips::FeatureMicroMips]; } @@ -332,36 +327,6 @@ void MipsTargetStreamer::emitStoreWithImmOffset( emitRRI(Opcode, SrcReg, ATReg, LoOffset, IDLoc, STI); } -/// Emit a store instruction with an symbol offset. -void MipsTargetStreamer::emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg, - unsigned BaseReg, - MCOperand &HiOperand, - MCOperand &LoOperand, - unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - // sc $8, sym => lui $at, %hi(sym) - // sc $8, %lo(sym)($at) - - // Generate the base address in ATReg. - emitRX(Mips::LUi, ATReg, HiOperand, IDLoc, STI); - if (!isMicroMips(STI) && isMipsR6(STI)) { - // For non-micromips r6 offset for 'sc' is not in the lower 16 bits so we - // put it in 'at'. - // sc $8, sym => lui $at, %hi(sym) - // addiu $at, $at, %lo(sym) - // sc $8, 0($at) - emitRRX(Mips::ADDiu, ATReg, ATReg, LoOperand, IDLoc, STI); - MCOperand Offset = MCOperand::createImm(0); - // Emit the store with the adjusted base and offset. - emitRRRX(Opcode, SrcReg, SrcReg, ATReg, Offset, IDLoc, STI); - } else { - if (BaseReg != Mips::ZERO) - emitRRR(Mips::ADDu, ATReg, ATReg, BaseReg, IDLoc, STI); - // Emit the store with the adjusted base and offset. - emitRRRX(Opcode, SrcReg, SrcReg, ATReg, LoOperand, IDLoc, STI); - } -} - /// Emit a load instruction with an immediate offset. DstReg and TmpReg are /// permitted to be the same register iff DstReg is distinct from BaseReg and /// DstReg is a GPR. 
It is the callers responsibility to identify such cases diff --git a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td index a735d45ddbfcf..9607d008bc979 100644 --- a/llvm/lib/Target/Mips/Mips32r6InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips32r6InstrInfo.td @@ -765,12 +765,12 @@ class LL_R6_DESC_BASE; +class LL_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9_exp, II_LL>; class SC_R6_DESC_BASE { dag OutOperandList = (outs GPROpnd:$dst); - dag InOperandList = (ins GPROpnd:$rt, mem_simm9:$addr); + dag InOperandList = (ins GPROpnd:$rt, mem_simm9_exp:$addr); string AsmString = !strconcat(instr_asm, "\t$rt, $addr"); list Pattern = []; bit mayStore = 1; diff --git a/llvm/lib/Target/Mips/Mips64r6InstrInfo.td b/llvm/lib/Target/Mips/Mips64r6InstrInfo.td index efebd77e531fe..33132d9ede92a 100644 --- a/llvm/lib/Target/Mips/Mips64r6InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips64r6InstrInfo.td @@ -75,7 +75,7 @@ class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd, II_DMUL, mul>; class DMULU_DESC : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd, II_DMUL>; class LDPC_DESC : PCREL_DESC_BASE<"ldpc", GPR64Opnd, simm18_lsl3, II_LDPC>; class LWUPC_DESC : PCREL_DESC_BASE<"lwupc", GPR32Opnd, simm19_lsl2, II_LWUPC>; -class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simmptr, II_LLD>; +class LLD_R6_DESC : LL_R6_DESC_BASE<"lld", GPR64Opnd, mem_simm9_exp, II_LLD>; class SCD_R6_DESC : SC_R6_DESC_BASE<"scd", GPR64Opnd, II_SCD>; class SELEQZ64_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR64Opnd>; class SELNEZ64_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR64Opnd>; @@ -106,7 +106,7 @@ class JIC64_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR64Opnd, list Defs = [AT]; } -class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9, II_LL>; +class LL64_R6_DESC : LL_R6_DESC_BASE<"ll", GPR32Opnd, mem_simm9_exp, II_LL>; class SC64_R6_DESC : SC_R6_DESC_BASE<"sc", GPR32Opnd, II_SC>; class JR_HB64_R6_DESC : JR_HB_DESC_BASE<"jr.hb", GPR64Opnd> { diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 353f10c8c64c1..be556cfba4dc3 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -257,6 +257,10 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { if (emitPseudoExpansionLowering(*OutStreamer, &*I)) continue; + // Skip the BUNDLE pseudo instruction and lower the contents + if (I->isBundle()) + continue; + if (I->getOpcode() == Mips::PseudoReturn || I->getOpcode() == Mips::PseudoReturn64 || I->getOpcode() == Mips::PseudoIndirectBranch || diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp index 3c47e74de46d6..60d14933a2e03 100644 --- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -612,12 +612,18 @@ bool MipsDelaySlotFiller::runOnMachineBasicBlock(MachineBasicBlock &MBB) { if (MipsCompactBranchPolicy.getValue() != CB_Always || !TII->getEquivalentCompactForm(I)) { if (searchBackward(MBB, *I)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": found instruction for delay slot" + " in backwards search.\n"); Filled = true; } else if (I->isTerminator()) { if (searchSuccBBs(MBB, I)) { Filled = true; + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": found instruction for delay slot" + " in successor BB search.\n"); } } else if (searchForward(MBB, I)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": found instruction for delay slot" + " in forwards search.\n"); Filled = true; } } @@ -662,6 +668,8 @@ bool 
MipsDelaySlotFiller::runOnMachineBasicBlock(MachineBasicBlock &MBB) { } // Bundle the NOP to the instruction with the delay slot. + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": could not fill delay slot for "; + I->dump()); BuildMI(MBB, std::next(I), I->getDebugLoc(), TII->get(Mips::NOP)); MIBundleBuilder(MBB, I, std::next(I, 2)); ++FilledSlots; @@ -679,13 +687,25 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin, for (IterTy I = Begin; I != End;) { IterTy CurrI = I; ++I; - + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": checking instruction: "; CurrI->dump()); // skip debug value - if (CurrI->isDebugInstr()) + if (CurrI->isDebugInstr()) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": ignoring debug instruction: "; + CurrI->dump()); continue; + } + + if (CurrI->isBundle()) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": ignoring BUNDLE instruction: "; + CurrI->dump()); + continue; + } - if (terminateSearch(*CurrI)) + if (terminateSearch(*CurrI)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": should terminate search: "; + CurrI->dump()); break; + } assert((!CurrI->isCall() && !CurrI->isReturn() && !CurrI->isBranch()) && "Cannot put calls, returns or branches in delay slot."); @@ -731,6 +751,9 @@ bool MipsDelaySlotFiller::searchRange(MachineBasicBlock &MBB, IterTy Begin, continue; Filler = CurrI; + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": found instruction for delay slot: "; + CurrI->dump()); + return true; } @@ -751,8 +774,11 @@ bool MipsDelaySlotFiller::searchBackward(MachineBasicBlock &MBB, MachineBasicBlock::iterator SlotI = Slot; if (!searchRange(MBB, ++SlotI.getReverse(), MBB.rend(), RegDU, MemDU, Slot, - Filler)) + Filler)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": could not find instruction for delay " + "slot using backwards search.\n"); return false; + } MBB.splice(std::next(SlotI), &MBB, Filler.getReverse()); MIBundleBuilder(MBB, SlotI, std::next(SlotI, 2)); @@ -772,8 +798,11 @@ bool MipsDelaySlotFiller::searchForward(MachineBasicBlock &MBB, RegDU.setCallerSaved(*Slot); - if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Slot, Filler)) + if (!searchRange(MBB, std::next(Slot), MBB.end(), RegDU, NM, Slot, Filler)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE ": could not find instruction for delay " + "slot using forwards search.\n"); return false; + } MBB.splice(std::next(Slot), &MBB, Filler); MIBundleBuilder(MBB, Slot, std::next(Slot, 2)); diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index da8be7c640b8b..3b626383d1d5a 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -1140,6 +1140,13 @@ def simm12 : Operand { let DecoderMethod = "DecodeSimm12"; } +def mem_simm9_exp : mem_generic { + let MIOperandInfo = (ops ptr_rc, simm9); + let ParserMatchClass = MipsMemSimmPtrAsmOperand; + let OperandNamespace = "MipsII"; + let OperandType = "OPERAND_MEM_SIMM9"; +} + foreach I = {9, 10, 11, 12, 16} in def mem_simm # I : mem_generic { let MIOperandInfo = (ops ptr_rc, !cast("simm" # I)); diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.cpp b/llvm/lib/Target/Mips/MipsMCInstLower.cpp index fd984058a2bf5..66e04bda2af32 100644 --- a/llvm/lib/Target/Mips/MipsMCInstLower.cpp +++ b/llvm/lib/Target/Mips/MipsMCInstLower.cpp @@ -34,7 +34,7 @@ void MipsMCInstLower::Initialize(MCContext *C) { MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, MachineOperandType MOTy, - unsigned Offset) const { + int64_t Offset) const { MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; MipsMCExpr::MipsExprKind TargetKind = 
MipsMCExpr::MEK_None; bool IsGpOff = false; @@ -161,9 +161,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, Kind, *Ctx); if (Offset) { - // Assume offset is never negative. - assert(Offset > 0); - + // Note: Offset can also be negative Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(Offset, *Ctx), *Ctx); } @@ -177,7 +175,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, } MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO, - unsigned offset) const { + int64_t offset) const { MachineOperandType MOTy = MO.getType(); switch (MOTy) { diff --git a/llvm/lib/Target/Mips/MipsMCInstLower.h b/llvm/lib/Target/Mips/MipsMCInstLower.h index 29af6f21de826..605a124bf1026 100644 --- a/llvm/lib/Target/Mips/MipsMCInstLower.h +++ b/llvm/lib/Target/Mips/MipsMCInstLower.h @@ -35,11 +35,11 @@ class LLVM_LIBRARY_VISIBILITY MipsMCInstLower { void Initialize(MCContext *C); void Lower(const MachineInstr *MI, MCInst &OutMI) const; - MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const; + MCOperand LowerOperand(const MachineOperand &MO, int64_t offset = 0) const; private: MCOperand LowerSymbolOperand(const MachineOperand &MO, - MachineOperandType MOTy, unsigned Offset) const; + MachineOperandType MOTy, int64_t Offset) const; MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2, MipsMCExpr::MipsExprKind Kind) const; void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const; diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MipsTargetStreamer.h index 298d056ce2c35..b389ba8938c4b 100644 --- a/llvm/lib/Target/Mips/MipsTargetStreamer.h +++ b/llvm/lib/Target/Mips/MipsTargetStreamer.h @@ -156,10 +156,6 @@ class MipsTargetStreamer : public MCTargetStreamer { unsigned BaseReg, int64_t Offset, function_ref GetATReg, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitSCWithSymOffset(unsigned Opcode, unsigned SrcReg, unsigned BaseReg, - MCOperand &HiOperand, MCOperand &LoOperand, - unsigned ATReg, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitLoadWithImmOffset(unsigned Opcode, unsigned DstReg, unsigned BaseReg, int64_t Offset, unsigned TmpReg, SMLoc IDLoc, const MCSubtargetInfo *STI); diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 28d7840d54124..1893d6e32c9ac 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -29,7 +29,7 @@ add_llvm_target(PowerPCCodeGen PPCEarlyReturn.cpp PPCFastISel.cpp PPCFrameLowering.cpp - PPCLoopPreIncPrep.cpp + PPCLoopInstrFormPrep.cpp PPCMCInstLower.cpp PPCMachineFunctionInfo.cpp PPCMachineScheduler.cpp diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 1216cd7272893..a61c34ca6f14b 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -87,4 +87,5 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { assert(!IsLittleEndian && "Little-endian XCOFF not supported."); CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 
8 : 4; ZeroDirective = "\t.space\t"; + SymbolsHaveSMC = true; } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index a9717bfc3082a..00df9e41fdae0 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" +#include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/ErrorHandling.h" @@ -108,8 +109,11 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer { : PPCTargetStreamer(S), OS(OS) {} void emitTCEntry(const MCSymbol &S) override { + const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); OS << "\t.tc "; - OS << S.getName(); + OS << (MAI->getSymbolsHaveSMC() + ? cast(S).getUnqualifiedName() + : S.getName()); OS << "[TC],"; OS << S.getName(); OS << '\n'; @@ -243,7 +247,10 @@ class PPCTargetXCOFFStreamer : public PPCTargetStreamer { PPCTargetXCOFFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {} void emitTCEntry(const MCSymbol &S) override { - report_fatal_error("TOC entries not supported yet."); + const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo(); + const unsigned PointerSize = MAI->getCodePointerSize(); + Streamer.EmitValueToAlignment(PointerSize); + Streamer.EmitSymbolValue(&S, PointerSize); } void emitMachine(StringRef CPU) override { diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h index 5a830d2294116..a83509f0e6870 100644 --- a/llvm/lib/Target/PowerPC/PPC.h +++ b/llvm/lib/Target/PowerPC/PPC.h @@ -34,7 +34,7 @@ namespace llvm { #ifndef NDEBUG FunctionPass *createPPCCTRLoopsVerify(); #endif - FunctionPass *createPPCLoopPreIncPrepPass(PPCTargetMachine &TM); + FunctionPass *createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM); FunctionPass *createPPCTOCRegDepsPass(); FunctionPass *createPPCEarlyReturnPass(); FunctionPass *createPPCVSXCopyPass(); @@ -60,7 +60,7 @@ namespace llvm { #ifndef NDEBUG void initializePPCCTRLoopsVerifyPass(PassRegistry&); #endif - void initializePPCLoopPreIncPrepPass(PassRegistry&); + void initializePPCLoopInstrFormPrepPass(PassRegistry&); void initializePPCTOCRegDepsPass(PassRegistry&); void initializePPCEarlyReturnPass(PassRegistry&); void initializePPCVSXCopyPass(PassRegistry&); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index de007d3b8d0b2..1d5396912ef08 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -51,6 +51,8 @@ def DirectivePwr6x def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">; def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">; def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">; +def DirectivePwrFuture + : SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">; def Feature64Bit : SubtargetFeature<"64bit","Has64BitSupport", "true", "Enable 64-bit instructions">; @@ -209,36 +211,94 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units", // came before them, the idea is to make implementations of new processors // less error prone and easier to read. // Namely: -// list Power8FeatureList = ... 
-// list<SubtargetFeature> FutureProcessorSpecificFeatureList = -// [ features that Power8 does not support ] -// list<SubtargetFeature> FutureProcessorFeatureList = -// !listconcat(Power8FeatureList, FutureProcessorSpecificFeatureList) +// list<SubtargetFeature> P8InheritableFeatures = ... +// list<SubtargetFeature> FutureProcessorAdditionalFeatures = +// [ features that Power8 does not support but are inheritable ] +// list<SubtargetFeature> FutureProcessorSpecificFeatures = +// [ features that Power8 does not support and are not inheritable ] +// list<SubtargetFeature> FutureProcessorInheritableFeatures = +// !listconcat(P8InheritableFeatures, FutureProcessorAdditionalFeatures) +// list<SubtargetFeature> FutureProcessorFeatures = +// !listconcat(FutureProcessorInheritableFeatures, +// FutureProcessorSpecificFeatures) // Makes it explicit and obvious what is new in FutureProcessor vs. Power8, as // well as providing a single point of definition if the feature set will be // used elsewhere. def ProcessorFeatures { - list<SubtargetFeature> Power7FeatureList = - [DirectivePwr7, FeatureAltivec, FeatureVSX, - FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE, - FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES, - FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX, - FeatureFPRND, FeatureFPCVT, FeatureISEL, - FeaturePOPCNTD, FeatureCMPB, FeatureLDBRX, - Feature64Bit /*, Feature64BitRegs */, - FeatureBPERMD, FeatureExtDiv, - FeatureMFTB, DeprecatedDST, FeatureTwoConstNR]; - list<SubtargetFeature> Power8SpecificFeatures = - [DirectivePwr8, FeatureP8Altivec, FeatureP8Vector, FeatureP8Crypto, - FeatureHTM, FeatureDirectMove, FeatureICBT, FeaturePartwordAtomic]; - list<SubtargetFeature> Power8FeatureList = - !listconcat(Power7FeatureList, Power8SpecificFeatures); - list<SubtargetFeature> Power9SpecificFeatures = - [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0, - FeatureVectorsUseTwoUnits, FeaturePPCPreRASched, FeaturePPCPostRASched]; - list<SubtargetFeature> Power9FeatureList = - !listconcat(Power8FeatureList, Power9SpecificFeatures); + // Power7 + list<SubtargetFeature> P7InheritableFeatures = [DirectivePwr7, + FeatureAltivec, + FeatureVSX, + FeatureMFOCRF, + FeatureFCPSGN, + FeatureFSqrt, + FeatureFRE, + FeatureFRES, + FeatureFRSQRTE, + FeatureFRSQRTES, + FeatureRecipPrec, + FeatureSTFIWX, + FeatureLFIWAX, + FeatureFPRND, + FeatureFPCVT, + FeatureISEL, + FeaturePOPCNTD, + FeatureCMPB, + FeatureLDBRX, + Feature64Bit, + /* Feature64BitRegs, */ + FeatureBPERMD, + FeatureExtDiv, + FeatureMFTB, + DeprecatedDST, + FeatureTwoConstNR]; + list<SubtargetFeature> P7SpecificFeatures = []; + list<SubtargetFeature> P7Features = + !listconcat(P7InheritableFeatures, P7SpecificFeatures); + + // Power8 + list<SubtargetFeature> P8AdditionalFeatures = [DirectivePwr8, + FeatureP8Altivec, + FeatureP8Vector, + FeatureP8Crypto, + FeatureHTM, + FeatureDirectMove, + FeatureICBT, + FeaturePartwordAtomic]; + list<SubtargetFeature> P8SpecificFeatures = []; + list<SubtargetFeature> P8InheritableFeatures = + !listconcat(P7InheritableFeatures, P8AdditionalFeatures); + list<SubtargetFeature> P8Features = + !listconcat(P8InheritableFeatures, P8SpecificFeatures); + + // Power9 + list<SubtargetFeature> P9AdditionalFeatures = [DirectivePwr9, + FeatureP9Altivec, + FeatureP9Vector, + FeatureISA3_0]; + // Some features are unique to Power9 and there is no reason to assume + // they will be part of any future CPUs. One example is the narrower + // dispatch for vector operations than scalar ones. For the time being, + // this list also includes scheduling-related features since we do not have + // enough info to create custom scheduling strategies for future CPUs.
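The naming scheme in the comment above composes feature lists by concatenation. A hedged illustration in plain C++ (standing in for TableGen's !listconcat; the feature names here are abbreviated and hypothetical):

```cpp
#include <string>
#include <vector>

using FeatureList = std::vector<std::string>;

// Mirrors !listconcat: a CPU's full feature set is its inheritable base
// plus its own non-inheritable extras; only the inheritable part is
// handed on to the next CPU in line.
static FeatureList concat(FeatureList A, const FeatureList &B) {
  A.insert(A.end(), B.begin(), B.end());
  return A;
}

int main() {
  const FeatureList P8Inheritable = {"p8-vector", "direct-move"};
  const FeatureList P9Additional = {"isa-v3.0"};            // inheritable
  const FeatureList P9Specific = {"vectors-use-two-units"}; // P9-only
  const FeatureList P9Inheritable = concat(P8Inheritable, P9Additional);
  const FeatureList P9Features = concat(P9Inheritable, P9Specific);
  // A "future" CPU starts from P9Inheritable, so the P9-specific
  // scheduling features never leak into it.
  const FeatureList FutureFeatures = concat(P9Inheritable, {});
  return P9Features.size() == 4 && FutureFeatures.size() == 3 ? 0 : 1;
}
```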
+ list P9SpecificFeatures = [FeatureVectorsUseTwoUnits, + FeaturePPCPreRASched, + FeaturePPCPostRASched]; + list P9InheritableFeatures = + !listconcat(P8InheritableFeatures, P9AdditionalFeatures); + list P9Features = + !listconcat(P9InheritableFeatures, P9SpecificFeatures); + + // Future + // For future CPU we assume that all of the existing features from Power 9 + // still exist with the exception of those we know are Power 9 specific. + list FutureAdditionalFeatures = []; + list FutureSpecificFeatures = []; + list FutureInheritableFeatures = + !listconcat(P9InheritableFeatures, FutureAdditionalFeatures); + list FutureFeatures = + !listconcat(FutureInheritableFeatures, FutureSpecificFeatures); } // Note: Future features to add when support is extended to more @@ -438,9 +498,12 @@ def : ProcessorModel<"pwr6x", G5Model, FeatureSTFIWX, FeatureLFIWAX, FeatureCMPB, FeatureFPRND, Feature64Bit, FeatureMFTB, DeprecatedDST]>; -def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.Power7FeatureList>; -def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.Power8FeatureList>; -def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.Power9FeatureList>; +def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>; +def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>; +def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>; +// No scheduler model for future CPU. +def : ProcessorModel<"future", NoSchedModel, + ProcessorFeatures.FutureFeatures>; def : Processor<"ppc", G3Itineraries, [Directive32, FeatureHardFloat, FeatureMFTB]>; def : Processor<"ppc32", G3Itineraries, [Directive32, FeatureHardFloat, @@ -451,7 +514,7 @@ def : ProcessorModel<"ppc64", G5Model, FeatureFRSQRTE, FeatureSTFIWX, Feature64Bit /*, Feature64BitRegs */, FeatureMFTB]>; -def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.Power8FeatureList>; +def : ProcessorModel<"ppc64le", P8Model, ProcessorFeatures.P8Features>; //===----------------------------------------------------------------------===// // Calling Conventions diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 53dbb02bb8e4b..9b8fb4ddd311d 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -43,6 +43,7 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -82,6 +83,8 @@ class PPCAsmPrinter : public AsmPrinter { const PPCSubtarget *Subtarget = nullptr; StackMaps SM; + virtual MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO); + public: explicit PPCAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) @@ -161,6 +164,11 @@ class PPCDarwinAsmPrinter : public PPCAsmPrinter { }; class PPCAIXAsmPrinter : public PPCAsmPrinter { +private: + static void ValidateGV(const GlobalVariable *GV); +protected: + MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO) override; + public: PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr Streamer) : PPCAsmPrinter(TM, std::move(Streamer)) {} @@ -514,17 +522,16 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI, /// Map a machine operand for a TOC pseudo-machine instruction to its /// corresponding MCSymbol. 
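The hunk that follows converts this helper from a file-static function taking the printer as a parameter into a virtual member, so the AIX printer can override the symbol lookup. A minimal hedged sketch of that refactoring pattern (hypothetical names, not the LLVM classes):

```cpp
#include <cstdio>

struct BasePrinter {
  virtual ~BasePrinter() = default;
  // Was conceptually: static const char *symbolFor(const BasePrinter &, int).
  virtual const char *symbolFor(int OperandKind) const {
    return OperandKind == 0 ? "plain-symbol" : "pool-symbol";
  }
  void emit(int OperandKind) const {
    std::printf("%s\n", symbolFor(OperandKind)); // now virtual dispatch
  }
};

struct AIXPrinter : BasePrinter {
  // AIX wants a csect-qualified name for some operand kinds.
  const char *symbolFor(int OperandKind) const override {
    return OperandKind == 0 ? "symbol[RW]"
                            : BasePrinter::symbolFor(OperandKind);
  }
};

int main() {
  AIXPrinter P;
  P.emit(0); // symbol[RW]
  P.emit(1); // pool-symbol
}
```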
-static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO, - AsmPrinter &AP) { +MCSymbol *PPCAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) { switch (MO.getType()) { case MachineOperand::MO_GlobalAddress: - return AP.getSymbol(MO.getGlobal()); + return getSymbol(MO.getGlobal()); case MachineOperand::MO_ConstantPoolIndex: - return AP.GetCPISymbol(MO.getIndex()); + return GetCPISymbol(MO.getIndex()); case MachineOperand::MO_JumpTableIndex: - return AP.GetJTISymbol(MO.getIndex()); + return GetJTISymbol(MO.getIndex()); case MachineOperand::MO_BlockAddress: - return AP.GetBlockAddressSymbol(MO.getBlockAddress()); + return GetBlockAddressSymbol(MO.getBlockAddress()); default: llvm_unreachable("Unexpected operand type to get symbol."); } @@ -688,7 +695,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for LWZtoc."); // Map the operand to its corresponding MCSymbol. - const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO); // Create a reference to the GOT entry for the symbol. The GOT entry will be // synthesized later. @@ -749,7 +756,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { // global address operand to be a reference to the TOC entry we will // synthesize later. MCSymbol *TOCEntry = - lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this)); + lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO)); const MCSymbolRefExpr::VariantKind VK = IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC; @@ -775,7 +782,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for ADDIStocHA."); // Map the machine operand to its corresponding MCSymbol. - MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); // Always use TOC on AIX. Map the global address operand to be a reference // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to @@ -805,7 +812,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Invalid operand for LWZtocL."); // Map the machine operand to its corresponding MCSymbol. - MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); // Always use TOC on AIX. Map the global address operand to be a reference // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to @@ -835,7 +842,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) && "Invalid operand for ADDIStocHA8!"); - const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); const bool GlobalToc = MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal()); @@ -881,7 +888,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "LDtocL used on symbol that could be accessed directly is " "invalid. 
Must match ADDIStocHA8.")); - const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO); if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large) MOSymbol = lookUpOrCreateTOCEntry(MOSymbol); @@ -911,7 +918,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { "Interposable definitions must use indirect access.")); const MCExpr *Exp = - MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this), + MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO), MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); @@ -1603,7 +1610,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) { // FIXME: why is power8 missing here? "ppc64", "ppc64le", - "power9" + "power9", + "future" }; // Get the numerically largest directive. @@ -1735,7 +1743,7 @@ void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) { return AsmPrinter::SetupMachineFunction(MF); } -void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { +void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) { // Early error checking limiting what is supported. if (GV->isThreadLocal()) report_fatal_error("Thread local not yet supported on AIX."); @@ -1745,6 +1753,19 @@ void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { if (GV->hasComdat()) report_fatal_error("COMDAT not yet supported by AIX."); +} + +void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { + ValidateGV(GV); + + // External global variables are already handled. + if (!GV->hasInitializer()) + return; + + // Create the symbol, set its storage class. + MCSymbolXCOFF *GVSym = cast(getSymbol(GV)); + GVSym->setStorageClass( + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM); if ((!GVKind.isCommon() && !GVKind.isBSS() && !GVKind.isData() && @@ -1758,11 +1779,6 @@ void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { MCSectionXCOFF *Csect = cast( getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); OutStreamer->SwitchSection(Csect); - - // Create the symbol, set its storage class, and emit it. - MCSymbolXCOFF *GVSym = cast(getSymbol(GV)); - GVSym->setStorageClass( - TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV)); GVSym->setContainingCsect(Csect); const DataLayout &DL = GV->getParent()->getDataLayout(); @@ -1801,7 +1817,10 @@ void PPCAIXAsmPrinter::EmitFunctionDescriptor() { OutStreamer->EmitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext), PointerSize); // Emit TOC base address. - MCSymbol *TOCBaseSym = OutContext.getOrCreateSymbol(StringRef("TOC[TC0]")); + const MCSectionXCOFF *TOCBaseSec = OutStreamer->getContext().getXCOFFSection( + StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT, + SectionKind::getData()); + const MCSymbol *TOCBaseSym = TOCBaseSec->getQualNameSymbol(); OutStreamer->EmitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext), PointerSize); // Emit a null environment pointer. @@ -1820,10 +1839,84 @@ void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) { MCSectionXCOFF *TOCBaseSection = OutStreamer->getContext().getXCOFFSection( StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT, SectionKind::getData()); + // The TOC-base always has 0 size, but 4 byte alignment. + TOCBaseSection->setAlignment(Align(4)); // Switch to section to emit TOC base. 
OutStreamer->SwitchSection(TOCBaseSection); + + PPCTargetStreamer &TS = + static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer()); + + for (auto &I : TOC) { + // Set up the csect for the current TC entry. + MCSectionXCOFF *TCEntry = OutStreamer->getContext().getXCOFFSection( + cast<MCSymbolXCOFF>(I.first)->getUnqualifiedName(), XCOFF::XMC_TC, + XCOFF::XTY_SD, XCOFF::C_HIDEXT, SectionKind::getData()); + cast<MCSymbolXCOFF>(I.second)->setContainingCsect(TCEntry); + OutStreamer->SwitchSection(TCEntry); + + OutStreamer->EmitLabel(I.second); + TS.emitTCEntry(*I.first); + } } +MCSymbol * +PPCAIXAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) { + const GlobalObject *GO = nullptr; + + // If the MO is a function or certain kinds of globals, we want to make sure to + // refer to the csect symbol, otherwise we can just do the default handling. + if (MO.getType() != MachineOperand::MO_GlobalAddress || + !(GO = dyn_cast<GlobalObject>(MO.getGlobal()))) + return PPCAsmPrinter::getMCSymbolForTOCPseudoMO(MO); + + // Do an early error check for globals we don't support. This will go away + // eventually. + const auto *GV = dyn_cast<GlobalVariable>(GO); + if (GV) { + ValidateGV(GV); + } + + MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(getSymbol(GO)); + + // If the global object is a global variable without an initializer or is a + // declaration of a function, then XSym is an externally referenced symbol. + // Hence we may need to explicitly create an MCSectionXCOFF for it so that we + // can return its symbol later. + if (GO->isDeclaration() && !XSym->hasContainingCsect()) { + // Make sure the storage class is set. + const XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO); + XSym->setStorageClass(SC); + + MCSectionXCOFF *Csect = OutStreamer->getContext().getXCOFFSection( + XSym->getName(), isa<Function>(GO) ? XCOFF::XMC_DS : XCOFF::XMC_UA, + XCOFF::XTY_ER, SC, SectionKind::getMetadata()); + XSym->setContainingCsect(Csect); + + return Csect->getQualNameSymbol(); + } + + // Handle initialized global variables. + if (GV) { + SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM); + + // If the operand is a common, then we should refer to the csect symbol. + if (GVKind.isCommon() || GVKind.isBSSLocal()) { + MCSectionXCOFF *Csect = cast<MCSectionXCOFF>( + getObjFileLowering().SectionForGlobal(GV, GVKind, TM)); + return Csect->getQualNameSymbol(); + } + + // Other global variables are referred to by labels inside of a single csect, + // so refer to the label directly. + return getSymbol(GV); + } + + // If the MO is a function, we want to make sure to refer to the function + // descriptor csect.
+ return XSym->getContainingCsect()->getQualNameSymbol(); +} /// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code /// for a MachineFunction to the given output stream, in a format that the diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f95f8be8a0481..a4f662dfdddb6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -52,6 +52,7 @@ #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/CallSite.h" @@ -1216,6 +1217,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, case PPC::DIR_PWR7: case PPC::DIR_PWR8: case PPC::DIR_PWR9: + case PPC::DIR_PWR_FUTURE: setPrefLoopAlignment(Align(16)); setPrefFunctionAlignment(Align(16)); break; @@ -3416,15 +3418,16 @@ SDValue PPCTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { + if (Subtarget.isAIXABI()) + return LowerFormalArguments_AIX(Chain, CallConv, isVarArg, Ins, dl, DAG, + InVals); if (Subtarget.is64BitELFABI()) return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); - else if (Subtarget.is32BitELFABI()) + if (Subtarget.is32BitELFABI()) return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); - // FIXME: We are using this for both AIX and Darwin. We should add appropriate - // AIX testing, and rename it appropriately. return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals); } @@ -5326,16 +5329,19 @@ SDValue PPCTargetLowering::FinishCall( GlobalAddressSDNode *G = cast(Callee); auto &Context = DAG.getMachineFunction().getMMI().getContext(); + const GlobalObject *GO = cast(G->getGlobal()); MCSymbolXCOFF *S = cast(Context.getOrCreateSymbol( - Twine(".") + Twine(G->getGlobal()->getName()))); - - const GlobalValue *GV = G->getGlobal(); - if (GV && GV->isDeclaration() && !S->hasContainingCsect()) { - // On AIX, undefined symbol need to associate with a MCSectionXCOFF to - // get the correct storage mapping class. In this case, XCOFF::XMC_PR. + Twine(".") + Twine(GO->getName()))); + + if (GO && GO->isDeclaration() && !S->hasContainingCsect()) { + // On AIX, an undefined symbol needs to be associated with a + // MCSectionXCOFF to get the correct storage mapping class. + // In this case, XCOFF::XMC_PR. + const XCOFF::StorageClass SC = + TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO); MCSectionXCOFF *Sec = Context.getXCOFFSection(S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER, - XCOFF::C_EXT, SectionKind::getMetadata()); + SC, SectionKind::getMetadata()); S->setContainingCsect(Sec); } @@ -6803,6 +6809,117 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, } } +static const TargetRegisterClass *getRegClassForSVT(MVT::SimpleValueType SVT, + bool IsPPC64) { + assert((IsPPC64 || SVT != MVT::i64) && + "i64 should have been split for 32-bit codegen."); + + switch (SVT) { + default: + report_fatal_error("Unexpected value type for formal argument"); + case MVT::i1: + case MVT::i32: + case MVT::i64: + return IsPPC64 ? 
&PPC::G8RCRegClass : &PPC::GPRCRegClass; + case MVT::f32: + return &PPC::F4RCRegClass; + case MVT::f64: + return &PPC::F8RCRegClass; + } +} + +static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT, + SelectionDAG &DAG, SDValue ArgValue, + MVT LocVT, const SDLoc &dl) { + assert(ValVT.isScalarInteger() && LocVT.isScalarInteger()); + assert(ValVT.getSizeInBits() < LocVT.getSizeInBits()); + + if (Flags.isSExt()) + ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue, + DAG.getValueType(ValVT)); + else if (Flags.isZExt()) + ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue, + DAG.getValueType(ValVT)); + + return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue); +} + +SDValue PPCTargetLowering::LowerFormalArguments_AIX( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals) const { + + assert((CallConv == CallingConv::C || CallConv == CallingConv::Cold || + CallConv == CallingConv::Fast) && + "Unexpected calling convention!"); + + if (isVarArg) + report_fatal_error("This call type is unimplemented on AIX."); + + if (getTargetMachine().Options.GuaranteedTailCallOpt) + report_fatal_error("Tail call support is unimplemented on AIX."); + + if (useSoftFloat()) + report_fatal_error("Soft float support is unimplemented on AIX."); + + const PPCSubtarget &Subtarget = + static_cast(DAG.getSubtarget()); + if (Subtarget.hasQPX()) + report_fatal_error("QPX support is not supported on AIX."); + + const bool IsPPC64 = Subtarget.isPPC64(); + const unsigned PtrByteSize = IsPPC64 ? 8 : 4; + + // Assign locations to all of the incoming arguments. + SmallVector ArgLocs; + MachineFunction &MF = DAG.getMachineFunction(); + CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); + + // Reserve space for the linkage area on the stack. + const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize(); + // On AIX a minimum of 8 words is saved to the parameter save area. + const unsigned MinParameterSaveArea = 8 * PtrByteSize; + CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize); + CCInfo.AnalyzeFormalArguments(Ins, CC_AIX); + + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDValue ArgValue; + ISD::ArgFlagsTy Flags = Ins[i].Flags; + if (VA.isRegLoc()) { + EVT ValVT = VA.getValVT(); + MVT LocVT = VA.getLocVT(); + MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy; + unsigned VReg = + MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64)); + ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT); + if (ValVT.isScalarInteger() && + (ValVT.getSizeInBits() < LocVT.getSizeInBits())) { + ArgValue = + truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl); + } + InVals.push_back(ArgValue); + } else { + report_fatal_error("Handling of formal arguments on the stack is " + "unimplemented!"); + } + } + + // Area that is at least reserved in the caller of this function. + unsigned MinReservedArea = CCInfo.getNextStackOffset(); + + // Set the size that is at least reserved in caller of this function. Tail + // call optimized function's reserved stack space needs to be aligned so + // that taking the difference between two stack areas will result in an + // aligned stack. 
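truncateScalarIntegerArg above follows the usual pattern for narrow integer arguments: the value arrives widened to register width, an AssertSext/AssertZext records what the ABI guarantees about the upper bits, and a TRUNCATE recovers the narrow type. A hedged plain-C++ model of that contract (not the SelectionDAG code):

```cpp
#include <cassert>
#include <cstdint>

// A 32-bit argument passed in a 64-bit register: depending on the ABI
// flag, the upper bits must be a sign extension or all zero.
int32_t truncateWidenedArg(int64_t Widened, bool IsSExt, bool IsZExt) {
  if (IsSExt)
    assert(Widened == (int64_t)(int32_t)Widened &&
           "upper bits are not a sign extension"); // models AssertSext
  else if (IsZExt)
    assert((uint64_t)Widened <= UINT32_MAX &&
           "upper bits are not zero");             // models AssertZext
  return (int32_t)Widened;                         // models TRUNCATE
}

int main() {
  return truncateWidenedArg(-5, /*IsSExt=*/true, /*IsZExt=*/false) == -5
             ? 0 : 1;
}
```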
+ MinReservedArea = + EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea); + PPCFunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setMinReservedArea(MinReservedArea); + + return Chain; +} + SDValue PPCTargetLowering::LowerCall_AIX( SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg, bool isTailCall, bool isPatchPoint, @@ -14200,7 +14317,8 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { case PPC::DIR_PWR6X: case PPC::DIR_PWR7: case PPC::DIR_PWR8: - case PPC::DIR_PWR9: { + case PPC::DIR_PWR9: + case PPC::DIR_PWR_FUTURE: { if (!ML) break; @@ -15379,6 +15497,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const { // vector 7 2 2 return true; case PPC::DIR_PWR9: + case PPC::DIR_PWR_FUTURE: // type mul add shl // scalar 5 2 2 // vector 7 2 2 diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 77b19b2634669..612d1c6b3f26e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1121,6 +1121,10 @@ namespace llvm { SelectionDAG &DAG, SDValue ArgVal, const SDLoc &dl) const; + SDValue LowerFormalArguments_AIX( + SDValue Chain, CallingConv::ID CallConv, bool isVarArg, + const SmallVectorImpl &Ins, const SDLoc &dl, + SelectionDAG &DAG, SmallVectorImpl &InVals) const; SDValue LowerFormalArguments_Darwin( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index e94ef4b1e505c..f5e2b473f1ee5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3757,8 +3757,10 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, ForwardKilledOperandReg = MI.getOperand(ConstantOpNo).getReg(); unsigned Opc = MI.getOpcode(); - bool SpecialShift32 = - Opc == PPC::SLW || Opc == PPC::SLWo || Opc == PPC::SRW || Opc == PPC::SRWo; + bool SpecialShift32 = Opc == PPC::SLW || Opc == PPC::SLWo || + Opc == PPC::SRW || Opc == PPC::SRWo || + Opc == PPC::SLW8 || Opc == PPC::SLW8o || + Opc == PPC::SRW8 || Opc == PPC::SRW8o; bool SpecialShift64 = Opc == PPC::SLD || Opc == PPC::SLDo || Opc == PPC::SRD || Opc == PPC::SRDo; bool SetCR = Opc == PPC::SLWo || Opc == PPC::SRWo || diff --git a/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp similarity index 95% rename from llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp rename to llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 72c347e005192..086db4ef9ec90 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -1,4 +1,4 @@ -//===------ PPCLoopPreIncPrep.cpp - Loop Pre-Inc. AM Prep. Pass -----------===// +//===------ PPCLoopInstrFormPrep.cpp - Loop Instr Form Prep Pass ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -41,7 +41,7 @@ // *++p = c; //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "ppc-loop-preinc-prep" +#define DEBUG_TYPE "ppc-loop-instr-form-prep" #include "PPC.h" #include "PPCSubtarget.h" @@ -148,16 +148,16 @@ namespace { // For DQ form instructions, their displacements must be multiple of 16. 
enum InstrForm { UpdateForm = 1, DSForm = 4, DQForm = 16 }; - class PPCLoopPreIncPrep : public FunctionPass { + class PPCLoopInstrFormPrep : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid - PPCLoopPreIncPrep() : FunctionPass(ID) { - initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); + PPCLoopInstrFormPrep() : FunctionPass(ID) { + initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry()); } - PPCLoopPreIncPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { - initializePPCLoopPreIncPrepPass(*PassRegistry::getPassRegistry()); + PPCLoopInstrFormPrep(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) { + initializePPCLoopInstrFormPrepPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -237,20 +237,20 @@ namespace { } // end anonymous namespace -char PPCLoopPreIncPrep::ID = 0; -static const char *name = "Prepare loop for pre-inc. addressing modes"; -INITIALIZE_PASS_BEGIN(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) +char PPCLoopInstrFormPrep::ID = 0; +static const char *name = "Prepare loop for ppc preferred instruction forms"; +INITIALIZE_PASS_BEGIN(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(PPCLoopPreIncPrep, DEBUG_TYPE, name, false, false) +INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false) static const std::string PHINodeNameSuffix = ".phi"; static const std::string CastNodeNameSuffix = ".cast"; static const std::string GEPNodeIncNameSuffix = ".inc"; static const std::string GEPNodeOffNameSuffix = ".off"; -FunctionPass *llvm::createPPCLoopPreIncPrepPass(PPCTargetMachine &TM) { - return new PPCLoopPreIncPrep(TM); +FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) { + return new PPCLoopInstrFormPrep(TM); } static bool IsPtrInBounds(Value *BasePtr) { @@ -284,7 +284,7 @@ static Value *GetPointerOperand(Value *MemI) { return nullptr; } -bool PPCLoopPreIncPrep::runOnFunction(Function &F) { +bool PPCLoopInstrFormPrep::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -305,7 +305,7 @@ bool PPCLoopPreIncPrep::runOnFunction(Function &F) { return MadeChange; } -void PPCLoopPreIncPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, +void PPCLoopInstrFormPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, SmallVector &Buckets, unsigned MaxCandidateNum) { assert((MemI && GetPointerOperand(MemI)) && @@ -328,7 +328,7 @@ void PPCLoopPreIncPrep::addOneCandidate(Instruction *MemI, const SCEV *LSCEV, } } -SmallVector PPCLoopPreIncPrep::collectCandidates( +SmallVector PPCLoopInstrFormPrep::collectCandidates( Loop *L, std::function isValidCandidate, unsigned MaxCandidateNum) { @@ -369,7 +369,7 @@ SmallVector PPCLoopPreIncPrep::collectCandidates( return Buckets; } -bool PPCLoopPreIncPrep::prepareBaseForDispFormChain(Bucket &BucketChain, +bool PPCLoopInstrFormPrep::prepareBaseForDispFormChain(Bucket &BucketChain, InstrForm Form) { // RemainderOffsetInfo details: // key: value of (Offset urem DispConstraint). For DSForm, it can @@ -444,7 +444,7 @@ bool PPCLoopPreIncPrep::prepareBaseForDispFormChain(Bucket &BucketChain, // {-32769, 2003, 2007, 2011}, we choose -32769 as base offset, and left disp // for load/stores are {0, 34772, 34776, 34780}. Though each offset now is a // multipler of 4, it cannot be represented by sint16. 
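The offset-rebasing pitfall described in the comment above reduces to a simple legality predicate: the displacement must fit in simm16 and be a multiple of the form's constraint, which is exactly the enumerator value in InstrForm. A hedged sketch:

```cpp
#include <cstdint>

enum InstrForm { UpdateForm = 1, DSForm = 4, DQForm = 16 };

static bool isLegalDisp(int64_t Disp, InstrForm Form) {
  const bool FitsSimm16 = Disp >= -32768 && Disp <= 32767;
  return FitsSimm16 && Disp % static_cast<int64_t>(Form) == 0;
}

int main() {
  // From the comment above: rebasing {-32769, 2003, 2007, 2011} at -32769
  // yields displacements up to 34780 - multiples of 4, yet outside simm16,
  // so that base choice must be rejected for DS-form.
  const bool Rejected = !isLegalDisp(34780, DSForm); // true
  const bool Accepted = isLegalDisp(2004, DSForm);   // true
  return Rejected && Accepted ? 0 : 1;
}
```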
-bool PPCLoopPreIncPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { +bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { // We have a choice now of which instruction's memory operand we use as the // base for the generated PHI. Always picking the first instruction in each // bucket does not work well, specifically because that instruction might @@ -484,7 +484,7 @@ bool PPCLoopPreIncPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { return true; } -bool PPCLoopPreIncPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain, +bool PPCLoopInstrFormPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain, SmallSet &BBChanged, InstrForm Form) { bool MadeChange = false; @@ -676,7 +676,7 @@ bool PPCLoopPreIncPrep::rewriteLoadStores(Loop *L, Bucket &BucketChain, return MadeChange; } -bool PPCLoopPreIncPrep::updateFormPrep(Loop *L, +bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L, SmallVector &Buckets) { bool MadeChange = false; if (Buckets.empty()) @@ -695,7 +695,7 @@ bool PPCLoopPreIncPrep::updateFormPrep(Loop *L, return MadeChange; } -bool PPCLoopPreIncPrep::dispFormPrep(Loop *L, SmallVector &Buckets, +bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, SmallVector &Buckets, InstrForm Form) { bool MadeChange = false; @@ -721,7 +721,7 @@ bool PPCLoopPreIncPrep::dispFormPrep(Loop *L, SmallVector &Buckets, // This function will check to see if that PHI already exists and will return // true if it found an existing PHI with the matched start and increment as the // one we wanted to create. -bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, +bool PPCLoopInstrFormPrep::alreadyPrepared(Loop *L, Instruction* MemI, const SCEV *BasePtrStartSCEV, const SCEVConstant *BasePtrIncSCEV, InstrForm Form) { @@ -787,7 +787,7 @@ bool PPCLoopPreIncPrep::alreadyPrepared(Loop *L, Instruction* MemI, return false; } -bool PPCLoopPreIncPrep::runOnLoop(Loop *L) { +bool PPCLoopInstrFormPrep::runOnLoop(Loop *L) { bool MadeChange = false; // Only prep. 
the inner-most loop diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index 6aa2fdcbec822..7eeff007b78fb 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -18,6 +18,7 @@ // //===---------------------------------------------------------------------===// +#include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCPredicates.h" #include "PPC.h" #include "PPCInstrBuilder.h" @@ -806,6 +807,143 @@ bool PPCMIPeephole::simplifyCode(void) { combineSEXTAndSHL(MI, ToErase); break; } + case PPC::RLWINM: + case PPC::RLWINMo: + case PPC::RLWINM8: + case PPC::RLWINM8o: { + unsigned FoldingReg = MI.getOperand(1).getReg(); + if (!Register::isVirtualRegister(FoldingReg)) + break; + + MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg); + if (SrcMI->getOpcode() != PPC::RLWINM && + SrcMI->getOpcode() != PPC::RLWINMo && + SrcMI->getOpcode() != PPC::RLWINM8 && + SrcMI->getOpcode() != PPC::RLWINM8o) + break; + assert((MI.getOperand(2).isImm() && MI.getOperand(3).isImm() && + MI.getOperand(4).isImm() && SrcMI->getOperand(2).isImm() && + SrcMI->getOperand(3).isImm() && SrcMI->getOperand(4).isImm()) && + "Invalid PPC::RLWINM Instruction!"); + uint64_t SHSrc = SrcMI->getOperand(2).getImm(); + uint64_t SHMI = MI.getOperand(2).getImm(); + uint64_t MBSrc = SrcMI->getOperand(3).getImm(); + uint64_t MBMI = MI.getOperand(3).getImm(); + uint64_t MESrc = SrcMI->getOperand(4).getImm(); + uint64_t MEMI = MI.getOperand(4).getImm(); + + assert((MEMI < 32 && MESrc < 32 && MBMI < 32 && MBSrc < 32) && + "Invalid PPC::RLWINM Instruction!"); + + // If MBMI is bigger than MEMI, we can never get a run of ones. + // RotatedSrcMask non-wrap: + // 0........31|32........63 + // RotatedSrcMask: B---E B---E + // MaskMI: -----------|--E B------ + // Result: ----- --- (Bad candidate) + // + // RotatedSrcMask wrap: + // 0........31|32........63 + // RotatedSrcMask: --E B----|--E B---- + // MaskMI: -----------|--E B------ + // Result: --- -----|--- ----- (Bad candidate) + // + // One special case is when RotatedSrcMask is a full set mask. + // RotatedSrcMask full: + // 0........31|32........63 + // RotatedSrcMask: ------EB---|-------EB--- + // MaskMI: -----------|--E B------ + // Result: -----------|--- ------- (Good candidate) + + // Mark special case. + bool SrcMaskFull = (MBSrc - MESrc == 1) || (MBSrc == 0 && MESrc == 31); + + // For other MBMI > MEMI cases, just return. + if ((MBMI > MEMI) && !SrcMaskFull) + break; + + // Handle MBMI <= MEMI cases. + APInt MaskMI = APInt::getBitsSetWithWrap(32, 32 - MEMI - 1, 32 - MBMI); + // In MI, we only need the low 32 bits of SrcMI, so we only consider the + // low 32 bits of the SrcMI mask. Note that in APInt, the lowest bit is + // at index 0, while in the PowerPC ISA, the lowest bit is at index 63. + APInt MaskSrc = + APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc); + // Current APInt::getBitsSetWithWrap sets all bits to 0 if loBit is + // equal to highBit. + // If MBSrc - MESrc == 1, we expect a full set mask instead of Null. + if (SrcMaskFull && (MBSrc - MESrc == 1)) + MaskSrc.setAllBits(); + + APInt RotatedSrcMask = MaskSrc.rotl(SHMI); + APInt FinalMask = RotatedSrcMask & MaskMI; + uint32_t NewMB, NewME; + + // If the final mask is 0, the result of MI should be 0 too.
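The APInt manipulation above is easier to sanity-check with a hedged 32-bit scalar model (plain C++, not the APInt code): rlwinm(x, SH, MB, ME) is rotl32(x, SH) & ppcMask(MB, ME), where PowerPC numbers bit 0 as the most significant bit and the mask wraps when MB > ME. Because rotation distributes over AND, two chained rlwinm ops compose into one rotate by (SHSrc + SHMI) % 32 under the intersected mask, which is encodable only when that mask is zero or still a run of ones:

```cpp
#include <cstdint>

static uint32_t rotl32(uint32_t V, unsigned S) {
  S &= 31;
  return S == 0 ? V : (V << S) | (V >> (32 - S));
}

// PowerPC bit numbering: ppcMask(MB, ME) sets bits MB..ME counted from
// the most significant end, wrapping around when MB > ME.
static uint32_t ppcMask(unsigned MB, unsigned ME) {
  const uint32_t FromMB = 0xFFFFFFFFu >> MB;        // PPC bits MB..31
  const uint32_t UptoME = 0xFFFFFFFFu << (31 - ME); // PPC bits 0..ME
  return MB <= ME ? (FromMB & UptoME) : (FromMB | UptoME);
}

static uint32_t rlwinm(uint32_t X, unsigned SH, unsigned MB, unsigned ME) {
  return rotl32(X, SH) & ppcMask(MB, ME);
}

int main() {
  // Check the fold on one value: the chained form equals a single rotate
  // by (27 + 19) % 32 = 14 under the composed mask.
  const uint32_t X = 0xDEADBEEFu;
  const uint32_t Chained = rlwinm(rlwinm(X, 27, 5, 31), 19, 0, 12);
  const uint32_t Mask = rotl32(ppcMask(5, 31), 19) & ppcMask(0, 12);
  const uint32_t Folded = rotl32(X, (27 + 19) % 32) & Mask;
  return Chained == Folded ? 0 : 1;
}
```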
+ if (FinalMask.isNullValue()) { + bool Is64Bit = (MI.getOpcode() == PPC::RLWINM8 || + MI.getOpcode() == PPC::RLWINM8o); + + LLVM_DEBUG(dbgs() << "Replace Instr: "); + LLVM_DEBUG(MI.dump()); + + if (MI.getOpcode() == PPC::RLWINM || MI.getOpcode() == PPC::RLWINM8) { + // Replace MI with "LI 0" + MI.RemoveOperand(4); + MI.RemoveOperand(3); + MI.RemoveOperand(2); + MI.getOperand(1).ChangeToImmediate(0); + MI.setDesc(TII->get(Is64Bit ? PPC::LI8 : PPC::LI)); + } else { + // Replace MI with "ANDIo reg, 0" + MI.RemoveOperand(4); + MI.RemoveOperand(3); + MI.getOperand(2).setImm(0); + MI.setDesc(TII->get(Is64Bit ? PPC::ANDIo8 : PPC::ANDIo)); + } + Simplified = true; + NumRotatesCollapsed++; + + LLVM_DEBUG(dbgs() << "With: "); + LLVM_DEBUG(MI.dump()); + } else if (isRunOfOnes((unsigned)(FinalMask.getZExtValue()), NewMB, + NewME) || SrcMaskFull) { + // If FoldingReg has only one use and it is not RLWINMo or + // RLWINM8o, it is safe to delete its def SrcMI. Otherwise keep it. + if (MRI->hasOneNonDBGUse(FoldingReg) && + (SrcMI->getOpcode() == PPC::RLWINM || + SrcMI->getOpcode() == PPC::RLWINM8)) { + ToErase = SrcMI; + LLVM_DEBUG(dbgs() << "Delete dead instruction: "); + LLVM_DEBUG(SrcMI->dump()); + } + + LLVM_DEBUG(dbgs() << "Converting Instr: "); + LLVM_DEBUG(MI.dump()); + + uint16_t NewSH = (SHSrc + SHMI) % 32; + MI.getOperand(2).setImm(NewSH); + // If the SrcMI mask is full, there is no need to update MBMI and MEMI. + if (!SrcMaskFull) { + MI.getOperand(3).setImm(NewMB); + MI.getOperand(4).setImm(NewME); + } + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(true); + SrcMI->getOperand(1).setIsKill(false); + } else + // About to replace MI.getOperand(1), clear its kill flag. + MI.getOperand(1).setIsKill(false); + + Simplified = true; + NumRotatesCollapsed++; + + LLVM_DEBUG(dbgs() << "To: "); + LLVM_DEBUG(MI.dump()); + } + break; + } } } diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index dcf64a5d6f9b8..7266d82a08b54 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -57,6 +57,7 @@ namespace PPC { DIR_PWR7, DIR_PWR8, DIR_PWR9, + DIR_PWR_FUTURE, DIR_64 }; } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index d548e7ace68da..35f6d32a07db2 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -51,8 +51,8 @@ opt<bool> DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden, cl::desc("Disable CTR loops for PPC")); static cl:: -opt<bool> DisablePreIncPrep("disable-ppc-preinc-prep", cl::Hidden, - cl::desc("Disable PPC loop preinc prep")); +opt<bool> DisableInstrFormPrep("disable-ppc-instr-form-prep", cl::Hidden, + cl::desc("Disable PPC loop instr form prep")); static cl::opt<bool> VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early", @@ -104,7 +104,7 @@ extern "C" void LLVMInitializePowerPCTarget() { #ifndef NDEBUG initializePPCCTRLoopsVerifyPass(PR); #endif - initializePPCLoopPreIncPrepPass(PR); + initializePPCLoopInstrFormPrepPass(PR); initializePPCTOCRegDepsPass(PR); initializePPCEarlyReturnPass(PR); initializePPCVSXCopyPass(PR); @@ -431,8 +431,8 @@ void PPCPassConfig::addIRPasses() { } bool PPCPassConfig::addPreISel() { - if (!DisablePreIncPrep && getOptLevel() != CodeGenOpt::None) - addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine())); + if (!DisableInstrFormPrep && getOptLevel() != CodeGenOpt::None) +
addPass(createPPCLoopInstrFormPrepPass(getPPCTargetMachine())); if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None) addPass(createHardwareLoopsPass()); diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 380d718885251..7079498cd815e 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -651,8 +651,9 @@ unsigned PPCTTIImpl::getCacheLineSize() const { // On P7, P8 or P9 we have a cache line size of 128. unsigned Directive = ST->getCPUDirective(); + // Assume that Future CPU has the same cache line size as the others. if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || - Directive == PPC::DIR_PWR9) + Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE) return 128; // On other processors return a default of 64 bytes. @@ -684,8 +685,9 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { // For P7 and P8, floating-point instructions have a 6-cycle latency and // there are two execution units, so unroll by 12x for latency hiding. // FIXME: the same for P9 as previous gen until POWER9 scheduling is ready + // Assume that future is the same as the others. if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 || - Directive == PPC::DIR_PWR9) + Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE) return 12; // For most things, modern systems have two execution units (and diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b9aa5cf32b590..1e562f3f54b59 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -581,10 +581,7 @@ SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op, int64_t Offset = N->getOffset(); MVT XLenVT = Subtarget.getXLenVT(); - // Non-PIC TLS lowering should always use the LocalExec model. - TLSModel::Model Model = isPositionIndependent() - ? 
getTargetMachine().getTLSModel(N->getGlobal()) - : TLSModel::LocalExec; + TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal()); SDValue Addr; switch (Model) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index fe38c4ff02d33..b5343e8a83098 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -231,6 +231,9 @@ def : Pat<(fabs FPR64:$rs1), (FSGNJX_D $rs1, $rs1)>; def : PatFpr64Fpr64; def : Pat<(fcopysign FPR64:$rs1, (fneg FPR64:$rs2)), (FSGNJN_D $rs1, $rs2)>; +def : Pat<(fcopysign FPR64:$rs1, FPR32:$rs2), (FSGNJ_D $rs1, (FCVT_D_S $rs2))>; +def : Pat<(fcopysign FPR32:$rs1, FPR64:$rs2), (FSGNJ_S $rs1, (FCVT_S_D $rs2, + 0b111))>; // fmadd: rs1 * rs2 + rs3 def : Pat<(fma FPR64:$rs1, FPR64:$rs2, FPR64:$rs3), diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp index 91cb35dd72f26..c5cce39747a9e 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp @@ -41,8 +41,12 @@ void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp, void SystemZInstPrinter::printOperand(const MCOperand &MO, const MCAsmInfo *MAI, raw_ostream &O) { - if (MO.isReg()) - O << '%' << getRegisterName(MO.getReg()); + if (MO.isReg()) { + if (!MO.getReg()) + O << '0'; + else + O << '%' << getRegisterName(MO.getReg()); + } else if (MO.isImm()) O << MO.getImm(); else if (MO.isExpr()) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index f8adca740a681..7994176c4c265 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -3609,7 +3609,7 @@ SDValue SystemZTargetLowering::lowerCTPOP(SDValue Op, // Get the known-zero mask for the operand. 
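The SystemZ hunk just below rewrites (~Known.Zero).getActiveBits() as Known.getMaxValue().getActiveBits(); the two are equivalent (KnownBits::getMaxValue() returns ~Zero), the new form simply states the intent. A hedged miniature of the accounting:

```cpp
#include <cstdint>

// Miniature of the KnownBits idea: Zero/One record bits proven 0/1; any
// bit not known zero may be one, so the maximum possible value is ~Zero.
struct KnownBits32 {
  uint32_t Zero = 0;
  uint32_t One = 0;
  uint32_t getMaxValue() const { return ~Zero; }
};

static unsigned activeBits(uint32_t V) {
  unsigned N = 0;
  for (; V != 0; V >>= 1)
    ++N;
  return N;
}

int main() {
  KnownBits32 K;
  K.Zero = 0xFFFFFF00u; // upper 24 bits proven zero
  // CTPOP of such a value only has to look at the low 8 bits.
  return activeBits(K.getMaxValue()) == 8 ? 0 : 1;
}
```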
KnownBits Known = DAG.computeKnownBits(Op); - unsigned NumSignificantBits = (~Known.Zero).getActiveBits(); + unsigned NumSignificantBits = Known.getMaxValue().getActiveBits(); if (NumSignificantBits == 0) return DAG.getConstant(0, DL, VT); diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td index 8b334756611a4..041971ca7cb8f 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td @@ -2069,7 +2069,7 @@ let Predicates = [FeatureProcessorAssist] in { def PPA : SideEffectTernaryRRFc<"ppa", 0xB2E8, GR64, GR64, imm32zx4>; def : Pat<(int_s390_ppa_txassist GR32:$src), (PPA (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32), - 0, 1)>; + zero_reg, 1)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index b5fbbc427a29b..17ce31f01ed75 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -3132,6 +3132,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) { case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int: if (ForcedVEXEncoding != VEXEncoding_EVEX) return Match_Unsupported; + break; } return Match_Success; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index f08fcb575bf00..1ccb9b7cbf748 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -12,6 +12,8 @@ #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" @@ -22,6 +24,7 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -114,12 +117,24 @@ class X86AsmBackend : public MCAsmBackend { assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); - // Check that uppper bits are either all zeros or all ones. - // Specifically ignore overflow/underflow as long as the leakage is - // limited to the lower bits. This is to remain compatible with - // other assemblers. - assert((Size == 0 || isIntN(Size * 8 + 1, Value)) && - "Value does not fit in the Fixup field"); + int64_t SignedValue = static_cast(Value); + if ((Target.isAbsolute() || IsResolved) && + getFixupKindInfo(Fixup.getKind()).Flags & + MCFixupKindInfo::FKF_IsPCRel) { + // check that PC relative fixup fits into the fixup size. + if (Size > 0 && !isIntN(Size * 8, SignedValue)) + Asm.getContext().reportError( + Fixup.getLoc(), "value of " + Twine(SignedValue) + + " is too large for field of " + Twine(Size) + + ((Size == 1) ? " byte." : " bytes.")); + } else { + // Check that uppper bits are either all zeros or all ones. + // Specifically ignore overflow/underflow as long as the leakage is + // limited to the lower bits. This is to remain compatible with + // other assemblers. 
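The applyFixup change above turns a silent truncation into a diagnostic for PC-relative fixups: the signed value must fit the field exactly, while the absolute case keeps the looser isIntN(Size * 8 + 1, V) assert for compatibility with other assemblers. A hedged model of the range test:

```cpp
#include <cstdint>

// Models llvm::isIntN for 0 < Bits < 64: does V fit in a signed
// Bits-wide field?
static bool fitsSignedN(int64_t V, unsigned Bits) {
  const int64_t Hi = (int64_t(1) << (Bits - 1)) - 1;
  const int64_t Lo = -Hi - 1;
  return V >= Lo && V <= Hi;
}

int main() {
  // A 1-byte PC-relative branch field holds [-128, 127]; a value of 130
  // previously wrapped silently and now takes the reportError path.
  const bool InRange = fitsSignedN(127, 8); // true
  const bool TooBig = fitsSignedN(130, 8);  // false
  return InRange && !TooBig ? 0 : 1;
}
```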
+ assert((Size == 0 || isIntN(Size * 8 + 1, SignedValue)) && + "Value does not fit in the Fixup field"); + } for (unsigned i = 0; i != Size; ++i) Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8)); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 335127c6d0642..0bb23b03685c9 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -101,6 +101,248 @@ namespace X86 { COND_INVALID }; + + // The classification for the first instruction in macro fusion. + enum class FirstMacroFusionInstKind { + // TEST + Test, + // CMP + Cmp, + // AND + And, + // ADD, SUB + AddSub, + // INC, DEC + IncDec, + // Not valid as a first macro fusion instruction + Invalid + }; + + enum class SecondMacroFusionInstKind { + // JA, JB and variants. + AB, + // JE, JL, JG and variants. + ELG, + // JS, JP, JO and variants + SPO, + // Not a fusible jump. + Invalid, + }; + + /// classifyFirstOpcodeInMacroFusion - return the type of the first + /// instruction in macro-fusion. + inline FirstMacroFusionInstKind + classifyFirstOpcodeInMacroFusion(unsigned Opcode) { + switch (Opcode) { + default: + return FirstMacroFusionInstKind::Invalid; + // TEST + case X86::TEST16i16: + case X86::TEST16mr: + case X86::TEST16ri: + case X86::TEST16rr: + case X86::TEST32i32: + case X86::TEST32mr: + case X86::TEST32ri: + case X86::TEST32rr: + case X86::TEST64i32: + case X86::TEST64mr: + case X86::TEST64ri32: + case X86::TEST64rr: + case X86::TEST8i8: + case X86::TEST8mr: + case X86::TEST8ri: + case X86::TEST8rr: + return FirstMacroFusionInstKind::Test; + case X86::AND16i16: + case X86::AND16ri: + case X86::AND16ri8: + case X86::AND16rm: + case X86::AND16rr: + case X86::AND16rr_REV: + case X86::AND32i32: + case X86::AND32ri: + case X86::AND32ri8: + case X86::AND32rm: + case X86::AND32rr: + case X86::AND32rr_REV: + case X86::AND64i32: + case X86::AND64ri32: + case X86::AND64ri8: + case X86::AND64rm: + case X86::AND64rr: + case X86::AND64rr_REV: + case X86::AND8i8: + case X86::AND8ri: + case X86::AND8ri8: + case X86::AND8rm: + case X86::AND8rr: + case X86::AND8rr_REV: + return FirstMacroFusionInstKind::And; + // CMP + case X86::CMP16i16: + case X86::CMP16mr: + case X86::CMP16ri: + case X86::CMP16ri8: + case X86::CMP16rm: + case X86::CMP16rr: + case X86::CMP16rr_REV: + case X86::CMP32i32: + case X86::CMP32mr: + case X86::CMP32ri: + case X86::CMP32ri8: + case X86::CMP32rm: + case X86::CMP32rr: + case X86::CMP32rr_REV: + case X86::CMP64i32: + case X86::CMP64mr: + case X86::CMP64ri32: + case X86::CMP64ri8: + case X86::CMP64rm: + case X86::CMP64rr: + case X86::CMP64rr_REV: + case X86::CMP8i8: + case X86::CMP8mr: + case X86::CMP8ri: + case X86::CMP8ri8: + case X86::CMP8rm: + case X86::CMP8rr: + case X86::CMP8rr_REV: + return FirstMacroFusionInstKind::Cmp; + // ADD + case X86::ADD16i16: + case X86::ADD16ri: + case X86::ADD16ri8: + case X86::ADD16rm: + case X86::ADD16rr: + case X86::ADD16rr_REV: + case X86::ADD32i32: + case X86::ADD32ri: + case X86::ADD32ri8: + case X86::ADD32rm: + case X86::ADD32rr: + case X86::ADD32rr_REV: + case X86::ADD64i32: + case X86::ADD64ri32: + case X86::ADD64ri8: + case X86::ADD64rm: + case X86::ADD64rr: + case X86::ADD64rr_REV: + case X86::ADD8i8: + case X86::ADD8ri: + case X86::ADD8ri8: + case X86::ADD8rm: + case X86::ADD8rr: + case X86::ADD8rr_REV: + // SUB + case X86::SUB16i16: + case X86::SUB16ri: + case X86::SUB16ri8: + case X86::SUB16rm: + case X86::SUB16rr: + case X86::SUB16rr_REV: + case 
X86::SUB32i32: + case X86::SUB32ri: + case X86::SUB32ri8: + case X86::SUB32rm: + case X86::SUB32rr: + case X86::SUB32rr_REV: + case X86::SUB64i32: + case X86::SUB64ri32: + case X86::SUB64ri8: + case X86::SUB64rm: + case X86::SUB64rr: + case X86::SUB64rr_REV: + case X86::SUB8i8: + case X86::SUB8ri: + case X86::SUB8ri8: + case X86::SUB8rm: + case X86::SUB8rr: + case X86::SUB8rr_REV: + return FirstMacroFusionInstKind::AddSub; + // INC + case X86::INC16r: + case X86::INC16r_alt: + case X86::INC32r: + case X86::INC32r_alt: + case X86::INC64r: + case X86::INC8r: + // DEC + case X86::DEC16r: + case X86::DEC16r_alt: + case X86::DEC32r: + case X86::DEC32r_alt: + case X86::DEC64r: + case X86::DEC8r: + return FirstMacroFusionInstKind::IncDec; + } + } + + /// classifySecondCondCodeInMacroFusion - return the type of the second + /// instruction in macro-fusion. + inline SecondMacroFusionInstKind + classifySecondCondCodeInMacroFusion(X86::CondCode CC) { + if (CC == X86::COND_INVALID) + return SecondMacroFusionInstKind::Invalid; + + switch (CC) { + default: + return SecondMacroFusionInstKind::Invalid; + // JE,JZ + case X86::COND_E: + // JNE,JNZ + case X86::COND_NE: + // JL,JNGE + case X86::COND_L: + // JLE,JNG + case X86::COND_LE: + // JG,JNLE + case X86::COND_G: + // JGE,JNL + case X86::COND_GE: + return SecondMacroFusionInstKind::ELG; + // JB,JC + case X86::COND_B: + // JNA,JBE + case X86::COND_BE: + // JA,JNBE + case X86::COND_A: + // JAE,JNC,JNB + case X86::COND_AE: + return SecondMacroFusionInstKind::AB; + // JS + case X86::COND_S: + // JNS + case X86::COND_NS: + // JP,JPE + case X86::COND_P: + // JNP,JPO + case X86::COND_NP: + // JO + case X86::COND_O: + // JNO + case X86::COND_NO: + return SecondMacroFusionInstKind::SPO; + } + } + + inline bool isMacroFused(FirstMacroFusionInstKind FirstKind, + SecondMacroFusionInstKind SecondKind) { + switch (FirstKind) { + case X86::FirstMacroFusionInstKind::Test: + case X86::FirstMacroFusionInstKind::And: + return true; + case X86::FirstMacroFusionInstKind::Cmp: + case X86::FirstMacroFusionInstKind::AddSub: + return SecondKind == X86::SecondMacroFusionInstKind::AB || + SecondKind == X86::SecondMacroFusionInstKind::ELG; + case X86::FirstMacroFusionInstKind::IncDec: + return SecondKind == X86::SecondMacroFusionInstKind::ELG; + case X86::FirstMacroFusionInstKind::Invalid: + return false; + } + llvm_unreachable("unknown fusion type"); + } } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 6840fc12751da..0481a40d462ae 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -150,6 +150,18 @@ void initializeX86ExpandPseudoPass(PassRegistry &); void initializeX86FlagsCopyLoweringPassPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); + +namespace X86AS { +enum : unsigned { + GS = 256, + FS = 257, + SS = 258, + PTR32_SPTR = 270, + PTR32_UPTR = 271, + PTR64 = 272 +}; +} // End X86AS namespace + } // End llvm namespace #endif diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 3374cd054a6e1..799c1f5d1285e 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1261,7 +1261,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. 
// Function prologue is responsible for adjusting the stack pointer. - int Alloc = isEAXAlive ? NumBytes - 8 : NumBytes; + int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes; if (isUInt<32>(Alloc)) { BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(Alloc) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index d8f9c5f7270d2..3c33c4bb1f212 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2224,12 +2224,11 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.Scale = cast(Mgs->getScale())->getZExtValue(); unsigned AddrSpace = cast(Parent)->getPointerInfo().getAddrSpace(); - // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. - if (AddrSpace == 256) + if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); - if (AddrSpace == 257) + if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); - if (AddrSpace == 258) + if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); SDLoc DL(N); @@ -5222,12 +5221,20 @@ void X86DAGToDAGISel::Select(SDNode *Node) { } case ISD::STRICT_FADD: case ISD::STRICT_FSUB: + case ISD::STRICT_FP_ROUND: { + // X87 instructions has enabled these strict fp operation. + bool UsingFp80 = Node->getSimpleValueType(0) == MVT::f80 || + Node->getOperand(1).getSimpleValueType() == MVT::f80; + if (UsingFp80 || (!Subtarget->hasSSE1() && Subtarget->hasX87())) + break; + LLVM_FALLTHROUGH; + } case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: - case ISD::STRICT_FP_ROUND: // FIXME: Remove when we have isel patterns for strict versions of these // nodes. - CurDAG->mutateStrictFPToFP(Node); + if (!TLI->isStrictFPEnabled()) + CurDAG->mutateStrictFPToFP(Node); break; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c658363f8d6a1..a840ca429343b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -277,6 +277,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); } + // Handle address space casts between mixed sized pointers. + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); + // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); @@ -587,6 +591,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSIN , VT, Expand); setOperationAction(ISD::FCOS , VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); + + // Handle constrained floating-point operations of scalar. + setOperationAction(ISD::STRICT_FMUL , VT, Legal); + setOperationAction(ISD::STRICT_FDIV , VT, Legal); + setOperationAction(ISD::STRICT_FSQRT , VT, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); + // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten + // as Custom. + setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal); } } @@ -657,6 +670,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::LLROUND, MVT::f80, Expand); setOperationAction(ISD::LRINT, MVT::f80, Expand); setOperationAction(ISD::LLRINT, MVT::f80, Expand); + + // Handle constrained floating-point operations of scalar. 
+ setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); + // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten + // as Custom. + setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); } // f128 uses xmm registers, but most operations require libcalls. @@ -690,7 +714,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); - setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); // We need to custom handle any FP_ROUND with an f128 input, but // LegalizeDAG uses the result type to know when to run a custom handler. // So we have to list all legal floating point result types here. @@ -2422,6 +2447,10 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); + const TargetMachine &TM = getTargetMachine(); + if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS)) + return false; + return SrcAS < 256 && DestAS < 256; } @@ -4978,12 +5007,6 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); } -bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, - bool IsSigned) const { - // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available. - return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov(); -} - bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) @@ -19691,15 +19714,20 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { // fp128 needs to use a libcall. if (SrcVT == MVT::f128) { RTLIB::Libcall LC; - if (Op.getOpcode() == ISD::FP_TO_SINT) + if (IsSigned) LC = RTLIB::getFPTOSINT(SrcVT, VT); else LC = RTLIB::getFPTOUINT(SrcVT, VT); - // FIXME: Strict fp! - assert(!IsStrict && "Unhandled strict operation!"); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first; + std::pair Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, + SDLoc(Op), Chain); + + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } // Fall back to X87. @@ -19714,9 +19742,11 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { } SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); if (VT == MVT::f128) { @@ -19725,6 +19755,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { } assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); + // FIXME: Strict fp. 
+ assert(!IsStrict && "Strict FP not supported yet!"); return DAG.getNode(X86ISD::VFPEXT, DL, VT, DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, @@ -19732,8 +19764,10 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { } SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { + bool IsStrict = Op->isStrictFPOpcode(); + MVT VT = Op.getSimpleValueType(); - SDValue In = Op.getOperand(0); + SDValue In = Op.getOperand(IsStrict ? 1 : 0); MVT SVT = In.getSimpleValueType(); // It's legal except when f128 is involved @@ -19745,17 +19779,17 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. + + SDLoc dl(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first; -} + std::pair Tmp = makeLibCall(DAG, LC, VT, In, CallOptions, + dl, Chain); -// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking -// the default expansion of STRICT_FP_ROUND. -static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) { - // FIXME: Need to form a libcall with an input chain for f128. - assert(Op.getOperand(0).getValueType() != MVT::f128 && - "Don't know how to handle f128 yet!"); - return Op; + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } /// Depending on uarch and/or optimizing for size, we might prefer to use a @@ -24098,7 +24132,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MFI.setHasCopyImplyingStackAdjustment(true); // Don't do anything here, we will expand these intrinsics out later // during FinalizeISel in EmitInstrWithCustomInserter. - return SDValue(); + return Op; } case Intrinsic::x86_lwpins32: case Intrinsic::x86_lwpins64: @@ -24273,9 +24307,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); + SDValue Offset = DAG.getUNDEF(VMask.getValueType()); - return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT, - MemIntr->getMemOperand(), true /* truncating */); + return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask, + MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED, + true /* truncating */); } case X86ISD::VTRUNCUS: case X86ISD::VTRUNCS: { @@ -27586,12 +27622,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) return Op; - SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(), - N->getBasePtr(), Mask, - getZeroVector(VT, Subtarget, DAG, dl), - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType(), - N->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), + N->isExpandingLoad()); // Emit a blend. 
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad, PassThru); @@ -27625,11 +27660,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); - SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(), - N->getBasePtr(), Mask, PassThru, - N->getMemoryVT(), N->getMemOperand(), - N->getExtensionType(), - N->isExpandingLoad()); + SDValue NewLoad = DAG.getMaskedLoad( + WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, + PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), + N->getExtensionType(), N->isExpandingLoad()); SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), @@ -27675,7 +27709,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); Mask = ExtendToType(Mask, WideMaskVT, DAG, true); return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), - Mask, N->getMemoryVT(), N->getMemOperand(), + N->getOffset(), Mask, N->getMemoryVT(), + N->getMemOperand(), N->getAddressingMode(), N->isTruncatingStore(), N->isCompressingStore()); } @@ -27729,6 +27764,29 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl); } +static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { + SDLoc dl(Op); + SDValue Src = Op.getOperand(0); + MVT DstVT = Op.getSimpleValueType(); + + AddrSpaceCastSDNode *N = cast(Op.getNode()); + unsigned SrcAS = N->getSrcAddressSpace(); + + assert(SrcAS != N->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); + + if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) { + Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); + } else if (DstVT == MVT::i64) { + Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); + } else if (DstVT == MVT::i32) { + Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + } else { + report_fatal_error("Bad address space in addrspacecast"); + } + return Op; +} + SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const { // TODO: Eventually, the lowering of these nodes should be informed by or @@ -27773,9 +27831,21 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op, SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { - SmallVector Ops(Op->op_begin(), Op->op_end()); + + bool IsStrict = Op->isStrictFPOpcode(); + unsigned Offset = IsStrict ? 1 : 0; + SmallVector Ops(Op->op_begin() + Offset, Op->op_end()); + + SDLoc dl(Op); + SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); MakeLibCallOptions CallOptions; - return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first; + std::pair Tmp = makeLibCall(DAG, Call, MVT::f128, Ops, + CallOptions, dl, Chain); + + if (IsStrict) + return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); + + return Tmp.first; } /// Provide custom lowering hooks for some operations. 
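// ---------------------------------------------------------------------------
// A hedged illustrative sketch (not part of this patch): the source-level
// constructs that feed LowerADDRSPACECAST above, assuming clang with
// -fms-extensions targeting x86_64, where `__ptr32 __uptr` is mapped to
// address space 271 (X86AS::PTR32_UPTR) and `__ptr32 __sptr` to 270
// (X86AS::PTR32_SPTR); plain pointers live in address space 0.
void widenMixedPointers(int *__ptr32 __uptr U, int *__ptr32 __sptr S) {
  int *P1 = U; // addrspacecast 271 -> 0: lowered as ZERO_EXTEND to i64
  int *P2 = S; // addrspacecast 270 -> 0: lowered as SIGN_EXTEND to i64
  // Casting back narrows: addrspacecast 0 -> 271 lowers to TRUNCATE to i32.
  int *__ptr32 __uptr N = (int *__ptr32 __uptr)P1;
  (void)P2;
  (void)N;
}
// ---------------------------------------------------------------------------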
@@ -27825,9 +27895,10 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::STRICT_FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); - case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); - case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); - case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG); + case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); + case ISD::FP_ROUND: + case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); case ISD::STORE: return LowerStore(Op, Subtarget, DAG); case ISD::FADD: @@ -27902,6 +27973,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GC_TRANSITION_START: return LowerGC_TRANSITION_START(Op, DAG); case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG); + case ISD::ADDRSPACECAST: + return LowerADDRSPACECAST(Op, DAG); } } @@ -28691,6 +28764,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(Res.getValue(1)); return; } + case ISD::ADDRSPACECAST: { + SDValue Src = N->getOperand(0); + EVT DstVT = N->getValueType(0); + AddrSpaceCastSDNode *CastN = cast(N); + unsigned SrcAS = CastN->getSrcAddressSpace(); + + assert(SrcAS != CastN->getDestAddressSpace() && + "addrspacecast must be between different address spaces"); + + SDValue Res; + if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) + Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); + else if (DstVT == MVT::i64) + Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); + else if (DstVT == MVT::i32) + Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); + else + report_fatal_error("Unrecognized addrspacecast type legalization"); + + Results.push_back(Res); + return; + } } } @@ -40433,6 +40528,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + assert(ML->isUnindexed() && "Unexpected indexed masked load!"); // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. // However, some target hooks may need to be added to know when the transform // is profitable. Endianness would also have to be considered. @@ -40460,6 +40556,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + assert(ML->isUnindexed() && "Unexpected indexed masked load!"); if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) return SDValue(); @@ -40495,10 +40592,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, // The new masked load has an undef pass-through operand. The select uses the // original pass-through operand. 
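// ---------------------------------------------------------------------------
// A hedged illustrative sketch (not part of this patch): the updated
// SelectionDAG::getMaskedLoad signature that the call sites below migrate to.
// It now takes an Offset operand (undef for unindexed loads) and an
// ISD::MemIndexedMode; here both are simply copied from an existing node
// (assumes the enclosing llvm namespace).
static SDValue remakeMaskedLoad(SelectionDAG &DAG, MaskedLoadSDNode *N,
                                EVT VT, SDValue PassThru, const SDLoc &dl) {
  return DAG.getMaskedLoad(VT, dl, N->getChain(), N->getBasePtr(),
                           N->getOffset(), N->getMask(), PassThru,
                           N->getMemoryVT(), N->getMemOperand(),
                           N->getAddressingMode(), // ISD::UNINDEXED here
                           N->getExtensionType(), N->isExpandingLoad());
}
// ---------------------------------------------------------------------------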
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), - ML->getMask(), DAG.getUNDEF(VT), - ML->getMemoryVT(), ML->getMemOperand(), - ML->getExtensionType()); + SDValue NewML = DAG.getMaskedLoad( + VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(), + DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), + ML->getAddressingMode(), ML->getExtensionType()); SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getPassThru()); @@ -40584,8 +40681,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), Mst->getMemoryVT())) { return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), - Mst->getBasePtr(), Mask, - Mst->getMemoryVT(), Mst->getMemOperand(), true); + Mst->getBasePtr(), Mst->getOffset(), Mask, + Mst->getMemoryVT(), Mst->getMemOperand(), + Mst->getAddressingMode(), true); } return SDValue(); diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 576f2fa627cce..82f56f895a191 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1126,9 +1126,6 @@ namespace llvm { bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override; - bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT, - bool IsSigned) const override; - /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 5051d5453f3ac..5917894dd3ee7 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -2078,7 +2078,7 @@ multiclass avx512_cmp_scalar, EVEX_4V, VEX_LIG, Sched<[sched]>; + timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -2089,8 +2089,9 @@ multiclass avx512_cmp_scalar, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), @@ -2111,7 +2112,7 @@ multiclass avx512_cmp_scalar, - EVEX_4V, VEX_LIG, Sched<[sched]>; + EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), @@ -2121,7 +2122,7 @@ multiclass avx512_cmp_scalar, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -2522,6 +2523,7 @@ def X86cmpm_imm_commute : SDNodeXForm { +let Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc), "vcmp"#_.Suffix, @@ -2553,6 +2555,7 @@ multiclass avx512_vcmp_common, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } // Patterns for selecting with loads in other operand. def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), @@ -2582,6 +2585,7 @@ multiclass avx512_vcmp_common { // comparison code form (VCMP[EQ/LT/LE/...] 
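// ---------------------------------------------------------------------------
// A hedged illustrative sketch (not part of this patch): the SIMD_EXC mix-in
// appended to many defs in this file is assumed to be a TableGen class that
// bundles the implicit MXCSR use with the may-raise-FP-exception flag,
// mirroring what the explicit `let Uses = [MXCSR], mayRaiseFPException = 1`
// blocks spell out inline:
class SIMD_EXC_SKETCH {
  list<Register> Uses = [MXCSR];
  bit mayRaiseFPException = 1;
}
// ---------------------------------------------------------------------------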
+ let Uses = [MXCSR] in defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, @@ -2639,7 +2643,7 @@ def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2), multiclass avx512_scalar_fpclass opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, Predicate prd> { - let Predicates = [prd], ExeDomain = _.ExeDomain in { + let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in { def rr : AVX512 opc, string OpcodeStr, multiclass avx512_vector_fpclass opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, string mem>{ - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { def rr : AVX512; multiclass avx512_fp_scalar opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, X86FoldableSchedWrite sched, bit IsCommutable> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr_Int : AVX512_maskable_scalar opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_fp_scalar_round opc, string OpcodeStr,X86VectorVTInfo _, SDNode VecNode, X86FoldableSchedWrite sched, bit IsCommutable = 0> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_scalar opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, _.RC:$src2))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; defm rm_Int : AVX512_maskable_scalar, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; - let isCodeGenOnly = 1, Predicates = [HasAVX512] in { + let isCodeGenOnly = 1, Predicates = [HasAVX512], + Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -5356,6 +5361,7 @@ multiclass avx512_fp_scalar_sae opc, string OpcodeStr,X86VectorVTInfo _, EVEX2VEXOverride; } + let Uses = [MXCSR] in defm rrb_Int : AVX512_maskable_scalar opc, string OpcodeStr, } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSS">, XS, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc, SchedWriteFCmp.Scl, "VMINCSD">, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; + EVEX_CD8<64, CD8VT1>, SIMD_EXC; defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSS">, XS, - EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>; + EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC; defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc, SchedWriteFCmp.Scl, "VMAXCSD">, XD, VEX_W, EVEX_4V, VEX_LIG, - EVEX_CD8<64, CD8VT1>; + EVEX_CD8<64, CD8VT1>, SIMD_EXC; multiclass avx512_fp_packed opc, string OpcodeStr, SDPatternOperator OpNode, X86VectorVTInfo _, X86FoldableSchedWrite sched, bit IsCommutable, bit IsKCommutable = IsCommutable> { - let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable opc, string OpcodeStr, SDPatternOperator OpN multiclass avx512_fp_round_packed opc, string OpcodeStr, SDPatternOperator OpNodeRnd, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + 
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb: AVX512_maskable opc, string OpcodeStr, multiclass avx512_fp_sae_packed opc, string OpcodeStr, SDPatternOperator OpNodeSAE, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrb: AVX512_maskable opc, string OpcodeStr, SDPatternOperator Op } } +let Uses = [MXCSR] in multiclass avx512_fp_binop_p_round opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { defm PSZ : avx512_fp_round_packed opc, string OpcodeStr, SDNode OpNodeR EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } +let Uses = [MXCSR] in multiclass avx512_fp_binop_p_sae opc, string OpcodeStr, SDNode OpNodeRnd, X86SchedWriteSizes sched> { defm PSZ : avx512_fp_sae_packed; } +let Uses = [], mayRaiseFPException = 0 in { defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI, @@ -5578,10 +5588,11 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI, SchedWriteFLogicSizes, 1>; +} multiclass avx512_fp_scalef_p opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_scalef_scalar opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rr: AVX512_maskable_scalar opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fma3_213_round opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR] in defm rb: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fma3_231_round opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR] in defm rb: AVX512_maskable_3src opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable_3src 
opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fma3_132_round opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0, + Uses = [MXCSR] in defm rb: AVX512_maskable_3src, - AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>; + AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; let mayLoad = 1 in defm m_Int: AVX512_maskable_3src_scalar, - AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>; + AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in defm rb_Int: AVX512_maskable_3src_scalar, @@ -6648,13 +6666,14 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in { (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>; + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC; def m : AVX512FMA3S, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>; + [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in def rb : AVX512FMA3S opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched, RegisterClass SrcRC, X86VectorVTInfo DstVT, X86MemOperand x86memop, PatFrag ld_frag, string asm, - string mem> { + string mem, list _Uses = [MXCSR], + bit _mayRaiseFPException = 1> { +let ExeDomain = DstVT.ExeDomain, Uses = _Uses, + mayRaiseFPException = _mayRaiseFPException in { let hasSideEffects = 0, isCodeGenOnly = 1 in { def rr : SI opc, SDPatternOperator OpNode, X86FoldableSched (OpNode (DstVT.VT DstVT.RC:$src1), (ld_frag addr:$src2)))]>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; +} def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", (!cast(NAME#"rr_Int") DstVT.RC:$dst, DstVT.RC:$src1, SrcRC:$src2), 0, "att">; @@ -7032,6 +7055,7 @@ multiclass avx512_vcvtsi_round opc, SDNode OpNode, X86FoldableSchedWrite sched, RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm, string mem> { + let ExeDomain = DstVT.ExeDomain, Uses = [MXCSR] in def rrb_Int : SI, XS, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32, - v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">, + v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l", [], 0>, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd, WriteCvtI2SD, GR64, @@ -7105,7 +7129,7 @@ defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info, - i32mem, loadi32, "cvtusi2sd", "l">, + i32mem, loadi32, "cvtusi2sd", "l", [], 0>, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd, WriteCvtI2SD, GR64, @@ -7145,11 +7169,12 @@ multiclass avx512_cvt_s_int_round opc, X86VectorVTInfo SrcVT, SDNode OpNodeRnd, X86FoldableSchedWrite sched, string asm, string aliasStr> { - let Predicates = [HasAVX512] in { + let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in { def rr_Int : SI, - EVEX, VEX_LIG, Sched<[sched]>; + EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in def rrb_Int : SI, @@ -7159,7 +7184,7 @@ 
multiclass avx512_cvt_s_int_round opc, X86VectorVTInfo SrcVT,
                   !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
                   [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
-                  EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+                  EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
   } // Predicates = [HasAVX512]
 
   def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
@@ -7286,22 +7311,23 @@ multiclass avx512_cvt_s_all opc, string asm, X86VectorVTInfo _SrcRC,
                             X86VectorVTInfo _DstRC, SDNode OpNode,
                             SDNode OpNodeInt, SDNode OpNodeSAE,
                             X86FoldableSchedWrite sched, string aliasStr>{
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
   let isCodeGenOnly = 1 in {
   def rr : AVX512,
-           EVEX, VEX_LIG, Sched<[sched]>;
+           EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
   def rm : AVX512,
-           EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+           EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
   }
   def rr_Int : AVX512,
-               EVEX, VEX_LIG, Sched<[sched]>;
+               EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+  let Uses = [MXCSR] in
   def rrb_Int : AVX512,
@@ -7311,7 +7337,7 @@ let Predicates = [HasAVX512] in {
                 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
                 [(set _DstRC.RC:$dst, (OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
-                EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+                EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
 } //HasAVX512
 
 def : InstAlias opc, string OpcodeStr, X86VectorVTInfo _, X86VectorVTInfo _Src, SDNode OpNode, X86FoldableSchedWrite sched> {
@@ -7387,6 +7414,7 @@ multiclass avx512_cvt_fp_scalar opc, string OpcodeStr, X86VectorVTInfo _
 multiclass avx512_cvt_fp_sae_scalar opc, string OpcodeStr, X86VectorVTInfo _,
                                     X86VectorVTInfo _Src, SDNode OpNodeSAE,
                                     X86FoldableSchedWrite sched> {
+  let Uses = [MXCSR] in
   defm rrb_Int : AVX512_maskable_scalar opc, string OpcodeStr, X86VectorVTIn
 multiclass avx512_cvt_fp_rc_scalar opc, string OpcodeStr, X86VectorVTInfo _,
                                    X86VectorVTInfo _Src, SDNode OpNodeRnd,
                                    X86FoldableSchedWrite sched> {
+  let Uses = [MXCSR] in
   defm rrb_Int : AVX512_maskable_scalar opc, string OpcodeStr, X86VectorVTInfo _,
                  string Alias = "", X86MemOperand MemOp = _Src.MemOp,
                  RegisterClass MaskRC = _.KRCWM,
                  dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
-
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
   defm rr : AVX512_maskable_common opc, string OpcodeStr, X86VectorVTInfo _,
                    _.RC:$src0), vselect, "$src0 = $dst">,
            EVEX, EVEX_B, Sched<[sched.Folded]>;
+  }
 }
 
 // Conversion with SAE - suppress all exceptions
 multiclass avx512_vcvt_fp_sae opc, string OpcodeStr, X86VectorVTInfo _,
                               X86VectorVTInfo _Src, SDNode OpNodeSAE,
                               X86FoldableSchedWrite sched> {
+  let Uses = [MXCSR] in
   defm rrb : AVX512_maskable opc, string OpcodeStr, X86VectorVTInfo _,
 multiclass avx512_vcvt_fp_rc opc, string OpcodeStr, X86VectorVTInfo _,
                              X86VectorVTInfo _Src, SDNode OpNodeRnd,
                              X86FoldableSchedWrite sched> {
+  let Uses = [MXCSR] in
   defm rrb : AVX512_maskable, mayRaiseFPException = 0 in
 multiclass avx512_cvtdq2pd opc, string OpcodeStr, SDNode OpNode,
                            SDNode OpNode128, X86SchedWriteWidths sched> {
 // No rounding in this op
@@ -8521,6 +8554,7 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
 // Half precision conversion instructions
 //===----------------------------------------------------------------------===//
+let Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass avx512_cvtph2ps {
@@ -8537,6 +8571,7 @@ multiclass avx512_cvtph2ps {
+  let Uses = [MXCSR] in
   defm rrb :
AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src), "vcvtph2ps", "{sae}, $src", "$src, {sae}", @@ -8568,7 +8603,7 @@ let Predicates = [HasVLX] in { multiclass avx512_cvtps2ph { -let ExeDomain = GenericDomain in { +let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { def rr : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -8605,7 +8640,7 @@ let ExeDomain = GenericDomain in { multiclass avx512_cvtps2ph_sae { - let hasSideEffects = 0 in + let hasSideEffects = 0, Uses = [MXCSR] in defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), @@ -8664,52 +8699,53 @@ let Predicates = [HasVLX] in { // Unordered/Ordered scalar fp compare with Sae and set EFLAGS multiclass avx512_ord_cmp_sae opc, X86VectorVTInfo _, - string OpcodeStr, X86FoldableSchedWrite sched> { - let hasSideEffects = 0 in + string OpcodeStr, Domain d, + X86FoldableSchedWrite sched = WriteFCom> { + let hasSideEffects = 0, Uses = [MXCSR] in def rrb: AVX512, EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { - defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", WriteFCom>, + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSEPackedSingle>, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", WriteFCom>, + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSEPackedDouble>, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; - defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", WriteFCom>, + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSEPackedSingle>, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", WriteFCom>, + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSEPackedDouble>, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, - "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG, + "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, - "ucomisd", WriteFCom>, PD, EVEX, + "ucomisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; let Pattern = [] in { defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, - "comiss", WriteFCom>, PS, EVEX, VEX_LIG, + "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, - "comisd", WriteFCom>, PD, EVEX, + "comisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } let isCodeGenOnly = 1 in { defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG, + sse_load_f32, "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD, EVEX, + sse_load_f64, "ucomisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS, EVEX, VEX_LIG, + sse_load_f32, "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG, 
EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD, EVEX, + sse_load_f64, "comisd", SSEPackedDouble>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } @@ -8717,7 +8753,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in { /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { + let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm rr : AVX512_maskable_scalar opc, string OpcodeStr, SDNode OpNode, } } +let Uses = [MXCSR] in multiclass avx512_fp14_p_vl_all opc, string OpcodeStr, SDNode OpNode, X86SchedWriteWidths sched> { defm PSZ : avx512_fp14_p; multiclass avx512_fp28_s opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE, X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm r : AVX512_maskable_scalar, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; defm rb : AVX512_maskable_scalar opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -8840,7 +8877,7 @@ defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs, multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm r : AVX512_maskable, @@ -8862,7 +8899,7 @@ multiclass avx512_fp28_p opc, string OpcodeStr, X86VectorVTInfo _, } multiclass avx512_fp28_p_sae opc, string OpcodeStr, X86VectorVTInfo _, SDNode OpNode, X86FoldableSchedWrite sched> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rb : AVX512_maskable opc, string OpcodeStr, multiclass avx512_sqrt_packed opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm r: AVX512_maskable, EVEX, @@ -8942,6 +8979,7 @@ multiclass avx512_sqrt_packed opc, string OpcodeStr, } } +let Uses = [MXCSR], mayRaiseFPException = 1 in multiclass avx512_sqrt_packed_all opc, string OpcodeStr, X86SchedWriteSizes sched> { defm PSZ : avx512_sqrt_packed opc, string OpcodeStr, } } +let Uses = [MXCSR] in multiclass avx512_sqrt_packed_all_round opc, string OpcodeStr, X86SchedWriteSizes sched> { defm PSZ : avx512_sqrt_packed_round opc, string OpcodeStr, X86FoldableSchedWri "$src2, $src1", "$src1, $src2", (X86fsqrts (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; defm m_Int : AVX512_maskable_scalar, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; + let Uses = [MXCSR] in defm rb_Int : AVX512_maskable_scalar opc, string OpcodeStr, X86FoldableSchedWri def r : I, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in def m : I, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -9047,8 +9087,9 @@ multiclass avx512_rndscale_scalar opc, string OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT 
(X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 timm:$src3)))>, - Sched<[sched]>; + Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in defm rb_Int : AVX512_maskable_scalar opc, string OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales _.RC:$src1, _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in { def r : I, Sched<[sched]>; + []>, Sched<[sched]>, SIMD_EXC; let mayLoad = 1 in def m : I, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -10101,7 +10142,7 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256, //all instruction created with FROUND_CURRENT multiclass avx512_unary_fp_packed_imm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable opc, string OpcodeStr, SDNode OpNo multiclass avx512_unary_fp_sae_packed_imm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrib : AVX512_maskable opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _>{ - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable opc, string OpcodeStr, SDNode OpNode, // op(reg_vec2,mem_scalar,imm) multiclass avx512_fp_scalar_imm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in { + let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_scalar opc, string OpcodeStr, SDNode OpNode, multiclass avx512_fp_sae_packed_imm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm rrib : AVX512_maskable opc, string OpcodeStr, //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_scalar_imm opc, string OpcodeStr, SDNode OpNode, X86FoldableSchedWrite sched, X86VectorVTInfo _> { - let ExeDomain = _.ExeDomain in + let ExeDomain = _.ExeDomain, Uses = [MXCSR] in defm NAME#rrib : AVX512_maskable_scalar, mayRaiseFPException = 0 in { defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512, SchedWriteFShuffleSizes, 0, 1>; defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512, SchedWriteFShuffleSizes>; +} defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl, SchedWriteShuffle, HasBWI>; @@ -11587,7 +11630,8 @@ let Predicates = [HasVLX] in { multiclass avx512_fixupimm_packed opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo TblVT>{ - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, + Uses = [MXCSR], mayRaiseFPException = 1 in { defm rri : AVX512_maskable_3src opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, X86VectorVTInfo TblVT> : avx512_fixupimm_packed { -let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { +let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in { defm 
rrib : AVX512_maskable_3src opc, string OpcodeStr, (X86VFixupimms (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), - (i32 timm:$src4))>, Sched<[sched]>; + (i32 timm:$src4))>, Sched<[sched]>, SIMD_EXC; + let Uses = [MXCSR] in defm rrib : AVX512_maskable_3src_scalar opc, string OpcodeStr, (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), (i32 timm:$src4))>, - Sched<[sched.Folded, sched.ReadAfterFold]>; + Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC; } } @@ -12166,7 +12211,7 @@ defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb", //===----------------------------------------------------------------------===// let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle, - Constraints = "$src1 = $dst" in { + Constraints = "$src1 = $dst", Uses = [MXCSR], mayRaiseFPException = 1 in { defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info, (outs VR512:$dst), (ins VR512:$src2, f128mem:$src3), "v4fmaddps", "$src3, $src2", "$src2, $src3", @@ -12297,17 +12342,19 @@ defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16", // Truncate Float to BFloat16 multiclass avx512_cvtps2bf16 opc, string OpcodeStr, X86SchedWriteWidths sched> { - let Predicates = [HasBF16] in { + let Predicates = [HasBF16], Uses = [], mayRaiseFPException = 0 in { defm Z : avx512_vcvt_fp, EVEX_V512; } let Predicates = [HasBF16, HasVLX] in { + let Uses = [], mayRaiseFPException = 0 in { defm Z128 : avx512_vcvt_fp, EVEX_V128; defm Z256 : avx512_vcvt_fp, EVEX_V256; + } def : InstAlias(NAME # "Z128rr") VR128X:$dst, diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td index 0cca71bdc4315..4c84f4f2460db 100644 --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -95,7 +95,8 @@ multiclass fma3p_rm_132 opc, string OpcodeStr, RegisterClass RC, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>; } -let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1, + Uses = [MXCSR], mayRaiseFPException = 1 in multiclass fma3p_forms opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, string Suff, PatFrag MemFrag128, PatFrag MemFrag256, @@ -237,7 +238,7 @@ multiclass fma3s_rm_132 opc, string OpcodeStr, } let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, - hasSideEffects = 0 in + hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, string OpStr, string PackTy, string Suff, SDNode OpNode, RegisterClass RC, @@ -263,7 +264,8 @@ multiclass fma3s_forms opc132, bits<8> opc213, bits<8> opc231, // the lowest element of the FMA*_Int instruction. Even though such analysis // may be not implemented yet we allow the routines doing the actual commute // transformation to decide if one or another instruction is commutable or not. 
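// ---------------------------------------------------------------------------
// A hedged illustrative sketch (not part of this patch): why wrapping a whole
// multiclass in `let Uses = [MXCSR], mayRaiseFPException = 1 in`, as the FMA
// changes below do, is sufficient. A top-level `let` distributes its fields
// onto every instruction record the multiclass expands to, so both register
// and memory forms pick up the flags without per-def edits:
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma_sketch<bits<8> o, string Name> {
  def rr : I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), Name, []>;
  def rm : I<o, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), Name, []>;
}
// ---------------------------------------------------------------------------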
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0,
+    Uses = [MXCSR], mayRaiseFPException = 1 in
 multiclass fma3s_rm_int opc, string OpcodeStr,
                         Operand memopr, RegisterClass RC,
                         X86FoldableSchedWrite sched> {
@@ -384,6 +386,7 @@ defm : scalar_fma_patterns opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, PatFrag mem_frag, X86FoldableSchedWrite sched> {
@@ -425,7 +428,8 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
 multiclass fma4s_int opc, string OpcodeStr, Operand memop, ValueType VT,
                      X86FoldableSchedWrite sched> {
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0,
+    Uses = [MXCSR], mayRaiseFPException = 1 in {
   def rr_Int : FMA4S_Int opc, string OpcodeStr, SDNode OpNode, ValueType OpVT128, ValueType OpVT256, PatFrag ld_frag128, PatFrag ld_frag256,
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index 1b7a2ccde51fa..d9cf560831300 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -286,26 +286,26 @@ let Uses = [FPCW], mayRaiseFPException = 1 in {
 // FPBinary_rr just defines pseudo-instructions, no need to set scheduling
 // resources.
 let hasNoSchedulingInfo = 1 in {
-defm ADD : FPBinary_rr;
-defm SUB : FPBinary_rr;
-defm MUL : FPBinary_rr;
-defm DIV : FPBinary_rr;
+defm ADD : FPBinary_rr;
+defm SUB : FPBinary_rr;
+defm MUL : FPBinary_rr;
+defm DIV : FPBinary_rr;
 }
 // Sets the scheduling resources for the actual NAME#_Fm definitions.
 let SchedRW = [WriteFAddLd] in {
-defm ADD : FPBinary;
-defm SUB : FPBinary;
-defm SUBR: FPBinary;
+defm ADD : FPBinary;
+defm SUB : FPBinary;
+defm SUBR: FPBinary;
 }
 let SchedRW = [WriteFMulLd] in {
-defm MUL : FPBinary;
+defm MUL : FPBinary;
 }
 let SchedRW = [WriteFDivLd] in {
-defm DIV : FPBinary;
-defm DIVR: FPBinary;
+defm DIV : FPBinary;
+defm DIVR: FPBinary;
 }
 } // Uses = [FPCW], mayRaiseFPException = 1
@@ -366,7 +366,7 @@ defm ABS : FPUnary;
 let Uses = [FPCW], mayRaiseFPException = 1 in {
 let SchedRW = [WriteFSqrt80] in
-defm SQRT: FPUnary;
+defm SQRT: FPUnary;
 
 let SchedRW = [WriteFCom] in {
 let hasSideEffects = 0 in {
@@ -790,19 +790,19 @@ def : Pat<(X86fist64 RFP80:$src, addr:$op),
           (IST_Fp64m80 addr:$op, RFP80:$src)>;
 
 // FP extensions map onto simple pseudo-value conversions if they are to/from
 // the FP stack.
-def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
           Requires<[FPStackf32]>;
-def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+def : Pat<(f80 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
           Requires<[FPStackf32]>;
-def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+def : Pat<(f80 (any_fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
           Requires<[FPStackf64]>;
 
 // FP truncations map onto simple pseudo-value conversions if they are to/from
 // the FP stack. We have validated that only value-preserving truncations make
 // it through isel.
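// ---------------------------------------------------------------------------
// A hedged illustrative sketch (not part of this patch): why the patterns
// below can switch from fpround/fpextend to the any_* forms. The any_*
// fragments are assumed to be defined upstream (TargetSelectionDAG.td) as
// PatFrags matching both the ordinary and the constrained (strict) node,
// roughly:
def any_fpround_sketch : PatFrags<(ops node:$src),
                                  [(strict_fpround node:$src),
                                   (fpround node:$src)]>;
def any_fpextend_sketch : PatFrags<(ops node:$src),
                                   [(strict_fpextend node:$src),
                                    (fpextend node:$src)]>;
// ---------------------------------------------------------------------------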
-def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, +def : Pat<(f32 (any_fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, +def : Pat<(f32 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>, Requires<[FPStackf32]>; -def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, +def : Pat<(f64 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>, Requires<[FPStackf64]>; diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index de6f8a81dff65..1a4f7e1e6bbd6 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -706,6 +706,10 @@ def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>; +def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store + SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2> +]>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -1040,9 +1044,10 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, INSERT_get_vinsert256_imm>; def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_ld node:$src1, node:$src2, node:$src3), [{ + (masked_ld node:$src1, undef, node:$src2, node:$src3), [{ return !cast(N)->isExpandingLoad() && - cast(N)->getExtensionType() == ISD::NON_EXTLOAD; + cast(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast(N)->isUnindexed(); }]>; def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), @@ -1055,17 +1060,19 @@ def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3), }]>; def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_ld node:$src1, node:$src2, node:$src3), [{ - return cast(N)->isExpandingLoad(); + (masked_ld node:$src1, undef, node:$src2, node:$src3), [{ + return cast(N)->isExpandingLoad() && + cast(N)->isUnindexed(); }]>; // Masked store fragments. // X86mstore can't be implemented in core DAG files because some targets // do not support vector types (llvm-tblgen will fail). 
 def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                           (masked_st node:$src1, node:$src2, node:$src3), [{
-  return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
-         (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+                           (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+  return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         !cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
 }]>;
 
 def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1078,16 +1085,18 @@ def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
 
 def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                                   (masked_st node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+                                   (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
 }]>;
 
 // Masked truncstore fragments.
 // X86mtruncstore can't be implemented in core DAG files because some targets
 // don't support vector types (llvm-tblgen will fail).
 def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
-                             (masked_st node:$src1, node:$src2, node:$src3), [{
-  return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+                             (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+  return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+         cast<MaskedStoreSDNode>(N)->isUnindexed();
 }]>;
 
 def masked_truncstorevi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1111,10 +1120,10 @@ def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore,
 def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
-def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore,
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTX86MaskedStore,
                              [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
-def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore,
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
                               [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
 
 def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b66d9ffd5d5e2..41c6fc4aaf673 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1761,10 +1761,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::VCMPPSZ128rrik:
   case X86::VCMPPDZ256rrik:
   case X86::VCMPPSZ256rrik: {
-    unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f;
+    unsigned Imm =
+        MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
     Imm = X86::getSwappedVCMPImm(Imm);
     auto &WorkingMI = cloneIfNew(MI);
-    WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
+    WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index ffdcb65c93bd9..b8e80bcd566a8 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -823,7 +823,9 @@ let Constraints = "$src1 = $dst" in {
 multiclass sse12_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC,
                        SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
                        string asm, string mem, X86FoldableSchedWrite sched,
+                       Domain d,
                        SchedRead Int2Fpu = ReadDefault> {
+  let ExeDomain = d in {
   def rr : SI,
@@ -832,6 +834,7 @@ multiclass sse12_cvt_s opc, RegisterClass SrcRC, RegisterClass DstRC,
                 mem#"\t{$src, $dst|$dst, $src}",
                 [(set
DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, Sched<[sched.Folded]>; + } } multiclass sse12_cvt_p opc, RegisterClass RC, X86MemOperand x86memop, @@ -851,8 +854,8 @@ let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { multiclass sse12_vcvt_avx opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm, string mem, - X86FoldableSchedWrite sched> { -let hasSideEffects = 0, Predicates = [UseAVX] in { + X86FoldableSchedWrite sched, Domain d> { +let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in { def rr : SI, Sched<[sched, ReadDefault, ReadInt2Fpu]>; @@ -867,19 +870,19 @@ let hasSideEffects = 0, Predicates = [UseAVX] in { let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG; defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG; } @@ -889,13 +892,17 @@ defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, // where appropriate to do so. let isCodeGenOnly = 1 in { defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l", - WriteCvtI2SS>, XS, VEX_4V, VEX_LIG, SIMD_EXC; + WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V, + VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q", - WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC; + WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V, + VEX_W, VEX_LIG, SIMD_EXC; defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l", - WriteCvtI2SD>, XD, VEX_4V, VEX_LIG; + WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V, + VEX_LIG; defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q", - WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG, SIMD_EXC; + WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V, + VEX_W, VEX_LIG, SIMD_EXC; } // isCodeGenOnly = 1 let Predicates = [UseAVX] in { @@ -921,28 +928,28 @@ let Predicates = [UseAVX] in { let isCodeGenOnly = 1 in { defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, XS, SIMD_EXC; + WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC; defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32, "cvttss2si", "cvttss2si", - WriteCvtSS2I>, XS, REX_W, SIMD_EXC; + WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC; defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, XD, SIMD_EXC; + WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC; defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, "cvttsd2si", "cvttsd2si", - WriteCvtSD2I>, XD, REX_W, SIMD_EXC; + WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, "cvtsi2ss", "cvtsi2ss{l}", - WriteCvtI2SS, ReadInt2Fpu>, XS, SIMD_EXC; + WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC; defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, 
sint_to_fp, i64mem, loadi64, "cvtsi2ss", "cvtsi2ss{q}", - WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W, SIMD_EXC; + WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, "cvtsi2sd", "cvtsi2sd{l}", - WriteCvtI2SD, ReadInt2Fpu>, XD; + WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD; defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, "cvtsi2sd", "cvtsi2sd{q}", - WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W, SIMD_EXC; + WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC; } // isCodeGenOnly = 1 // Conversion Instructions Intrinsics - Match intrinsics which expect MM @@ -951,7 +958,8 @@ defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, multiclass sse12_cvt_sint opc, RegisterClass SrcRC, RegisterClass DstRC, ValueType DstVT, ValueType SrcVT, SDNode OpNode, Operand memop, ComplexPattern mem_cpat, string asm, - X86FoldableSchedWrite sched> { + X86FoldableSchedWrite sched, Domain d> { +let ExeDomain = d in { def rr_Int : SI, @@ -961,12 +969,13 @@ multiclass sse12_cvt_sint opc, RegisterClass SrcRC, RegisterClass DstRC, [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>, Sched<[sched.Folded]>; } +} multiclass sse12_cvt_sint_3addr opc, RegisterClass SrcRC, RegisterClass DstRC, X86MemOperand x86memop, string asm, string mem, X86FoldableSchedWrite sched, - bit Is2Addr = 1> { -let hasSideEffects = 0 in { + Domain d, bit Is2Addr = 1> { +let hasSideEffects = 0, ExeDomain = d in { def rr_Int : SI, XD, VEX, VEX_LIG; + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG; defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, sdmem, sse_load_f64, "cvtsd2si", - WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG; + WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG; } defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si, - sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD; + sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, + SSEPackedDouble>, XD; defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si, - sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W; + sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I, + SSEPackedDouble>, XD, REX_W; } let Predicates = [UseAVX] in { defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, SIMD_EXC; + i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>, + XS, VEX_4V, VEX_LIG, SIMD_EXC; defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC; + i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>, + XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC; defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG; + i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>, + XD, VEX_4V, VEX_LIG; defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC; + i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>, + XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC; } let Constraints = "$src1 = $dst" in { defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS, SIMD_EXC; + i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>, + XS, SIMD_EXC; defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W, 
SIMD_EXC; + i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>, + XS, REX_W, SIMD_EXC; defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, - i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD; + i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>, + XD; defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, - i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W, SIMD_EXC; + i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>, + XD, REX_W, SIMD_EXC; } def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}", @@ -1052,32 +1071,34 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I>, XS, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, - "cvttss2si", WriteCvtSS2I>, + "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG, VEX_W; defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSS2I>, XD, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG; defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, - "cvttsd2si", WriteCvtSS2I>, + "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG, VEX_W; } let Uses = [MXCSR], mayRaiseFPException = 1 in { defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int, ssmem, sse_load_f32, "cvttss2si", - WriteCvtSS2I>, XS; + WriteCvtSS2I, SSEPackedSingle>, XS; defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32, X86cvtts2Int, ssmem, sse_load_f32, - "cvttss2si", WriteCvtSS2I>, XS, REX_W; + "cvttss2si", WriteCvtSS2I, SSEPackedSingle>, + XS, REX_W; defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int, sdmem, sse_load_f64, "cvttsd2si", - WriteCvtSD2I>, XD; + WriteCvtSD2I, SSEPackedDouble>, XD; defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64, X86cvtts2Int, sdmem, sse_load_f64, - "cvttsd2si", WriteCvtSD2I>, XD, REX_W; + "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>, + XD, REX_W; } def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}", @@ -1117,18 +1138,18 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}", let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in { defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS, VEX, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG; defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG; + WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG; } let Uses = [MXCSR], mayRaiseFPException = 1 in { defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS; + WriteCvtSS2I, SSEPackedSingle>, XS; defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si, ssmem, sse_load_f32, "cvtss2si", - WriteCvtSS2I>, XS, REX_W; + WriteCvtSS2I, SSEPackedSingle>, XS, REX_W; defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load, "vcvtdq2ps\t{$src, $dst|$dst, $src}", @@ -1815,9 +1836,10 @@ let Constraints = "$src1 = $dst" in { // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS multiclass 
sse12_ord_cmp opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, - PatFrag ld_frag, string OpcodeStr, - X86FoldableSchedWrite sched> { -let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in { + PatFrag ld_frag, string OpcodeStr, Domain d, + X86FoldableSchedWrite sched = WriteFCom> { +let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1, + ExeDomain = d in { def rr: SI, @@ -1835,8 +1857,9 @@ let mayLoad = 1 in multiclass sse12_ord_cmp_int opc, RegisterClass RC, SDNode OpNode, ValueType vt, Operand memop, ComplexPattern mem_cpat, string OpcodeStr, - X86FoldableSchedWrite sched> { -let Uses = [MXCSR], mayRaiseFPException = 1 in { + Domain d, + X86FoldableSchedWrite sched = WriteFCom> { +let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in { def rr_Int: SI, @@ -1852,49 +1875,49 @@ let mayLoad = 1 in let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; let Pattern = [] in { defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; } let isCodeGenOnly = 1 in { defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG; + sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG; + sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss", WriteFCom>, PS; + "ucomiss", SSEPackedSingle>, PS; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd", WriteFCom>, PD; + "ucomisd", SSEPackedDouble>, PD; let Pattern = [] in { defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss", WriteFCom>, PS; + "comiss", SSEPackedSingle>, PS; defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd", WriteFCom>, PD; + "comisd", SSEPackedDouble>, PD; } let isCodeGenOnly = 1 in { defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss", WriteFCom>, PS; + sse_load_f32, "ucomiss", SSEPackedSingle>, PS; defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd", WriteFCom>, PD; + sse_load_f64, "ucomisd", SSEPackedDouble>, PD; defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss", WriteFCom>, PS; + sse_load_f32, "comiss", SSEPackedSingle>, 
PS; defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd", WriteFCom>, PD; + sse_load_f64, "comisd", SSEPackedDouble>, PD; } } // Defs = [EFLAGS] @@ -5519,7 +5542,7 @@ let ExeDomain = SSEPackedDouble in { // FP round - roundss, roundps, roundsd, roundpd let Predicates = [HasAVX, NoVLX] in { - let ExeDomain = SSEPackedSingle in { + let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in { // Intrinsic form defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32, loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>, @@ -5529,7 +5552,7 @@ let Predicates = [HasAVX, NoVLX] in { VEX, VEX_L, VEX_WIG; } - let ExeDomain = SSEPackedDouble in { + let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in { defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64, loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>, VEX, VEX_WIG; @@ -5541,9 +5564,9 @@ let Predicates = [HasAVX, NoVLX] in { let Predicates = [UseAVX] in { defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl, v4f32, v2f64, X86RndScales, 0>, - VEX_4V, VEX_LIG, VEX_WIG; + VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>, - VEX_4V, VEX_LIG, VEX_WIG; + VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC; } let Predicates = [UseAVX] in { @@ -7303,12 +7326,12 @@ multiclass f16c_ps2ph; - defm VCVTPH2PSY : f16c_ph2ps, VEX_L; + defm VCVTPH2PS : f16c_ph2ps, SIMD_EXC; + defm VCVTPH2PSY : f16c_ph2ps, VEX_L, SIMD_EXC; defm VCVTPS2PH : f16c_ps2ph; + WriteCvtPS2PHSt>, SIMD_EXC; defm VCVTPS2PHY : f16c_ps2ph, VEX_L; + WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC; // Pattern match vcvtph2ps of a scalar i64 load. def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))), diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp index c6da4b09dd60f..b19d1263e0c91 100644 --- a/llvm/lib/Target/X86/X86MacroFusion.cpp +++ b/llvm/lib/Target/X86/X86MacroFusion.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86BaseInfo.h" #include "X86MacroFusion.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MacroFusion.h" @@ -18,160 +19,13 @@ using namespace llvm; -namespace { - -// The classification for the first instruction. -enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid }; - -// The classification for the second instruction (jump). -enum class JumpKind { - // JE, JL, JG and variants. - ELG, - // JA, JB and variants. - AB, - // JS, JP, JO and variants. - SPO, - // Not a fusable jump. 
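Stepping back from the X86InstrSSE.td hunks above: the changes thread an explicit `Domain` parameter into the scalar convert/compare multiclasses and tag the rounding and F16C conversions with MXCSR uses and `mayRaiseFPException` (`SIMD_EXC`). The following is a minimal illustrative model, not the LLVM API, of the two orthogonal pieces of metadata being attached; all names are borrowed from the diff for readability.

```cpp
// Illustrative sketch only: what ExeDomain plus the SIMD_EXC tag express.
enum ExeDomain { SSEPackedSingle, SSEPackedDouble, SSEPackedInt };

struct SSEInstModel {
  const char *Mnemonic;
  ExeDomain Domain;         // execution unit the uop runs in; mixing
                            // domains can cost bypass-delay cycles
  bool UsesMXCSR;           // reads rounding mode / exception masks
  bool MayRaiseFPException; // what the SIMD_EXC tag models for strict FP
};

// Single-precision converts land in the single domain, double-precision
// ones in the double domain, matching the defm changes above:
const SSEInstModel Models[] = {
    {"cvttss2si", SSEPackedSingle, true, true},
    {"cvttsd2si", SSEPackedDouble, true, true},
    {"ucomiss", SSEPackedSingle, true, true},
};
```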
- Invalid, -}; - -} // namespace - -static FirstInstrKind classifyFirst(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - return FirstInstrKind::Invalid; - case X86::TEST8rr: - case X86::TEST16rr: - case X86::TEST32rr: - case X86::TEST64rr: - case X86::TEST8ri: - case X86::TEST16ri: - case X86::TEST32ri: - case X86::TEST64ri32: - case X86::TEST8mr: - case X86::TEST16mr: - case X86::TEST32mr: - case X86::TEST64mr: - return FirstInstrKind::Test; - case X86::AND16ri: - case X86::AND16ri8: - case X86::AND16rm: - case X86::AND16rr: - case X86::AND32ri: - case X86::AND32ri8: - case X86::AND32rm: - case X86::AND32rr: - case X86::AND64ri32: - case X86::AND64ri8: - case X86::AND64rm: - case X86::AND64rr: - case X86::AND8ri: - case X86::AND8rm: - case X86::AND8rr: - return FirstInstrKind::And; - case X86::CMP16ri: - case X86::CMP16ri8: - case X86::CMP16rm: - case X86::CMP16rr: - case X86::CMP16mr: - case X86::CMP32ri: - case X86::CMP32ri8: - case X86::CMP32rm: - case X86::CMP32rr: - case X86::CMP32mr: - case X86::CMP64ri32: - case X86::CMP64ri8: - case X86::CMP64rm: - case X86::CMP64rr: - case X86::CMP64mr: - case X86::CMP8ri: - case X86::CMP8rm: - case X86::CMP8rr: - case X86::CMP8mr: - return FirstInstrKind::Cmp; - case X86::ADD16ri: - case X86::ADD16ri8: - case X86::ADD16ri8_DB: - case X86::ADD16ri_DB: - case X86::ADD16rm: - case X86::ADD16rr: - case X86::ADD16rr_DB: - case X86::ADD32ri: - case X86::ADD32ri8: - case X86::ADD32ri8_DB: - case X86::ADD32ri_DB: - case X86::ADD32rm: - case X86::ADD32rr: - case X86::ADD32rr_DB: - case X86::ADD64ri32: - case X86::ADD64ri32_DB: - case X86::ADD64ri8: - case X86::ADD64ri8_DB: - case X86::ADD64rm: - case X86::ADD64rr: - case X86::ADD64rr_DB: - case X86::ADD8ri: - case X86::ADD8ri_DB: - case X86::ADD8rm: - case X86::ADD8rr: - case X86::ADD8rr_DB: - case X86::SUB16ri: - case X86::SUB16ri8: - case X86::SUB16rm: - case X86::SUB16rr: - case X86::SUB32ri: - case X86::SUB32ri8: - case X86::SUB32rm: - case X86::SUB32rr: - case X86::SUB64ri32: - case X86::SUB64ri8: - case X86::SUB64rm: - case X86::SUB64rr: - case X86::SUB8ri: - case X86::SUB8rm: - case X86::SUB8rr: - return FirstInstrKind::ALU; - case X86::INC16r: - case X86::INC32r: - case X86::INC64r: - case X86::INC8r: - case X86::DEC16r: - case X86::DEC32r: - case X86::DEC64r: - case X86::DEC8r: - return FirstInstrKind::IncDec; - } +static X86::FirstMacroFusionInstKind classifyFirst(const MachineInstr &MI) { + return X86::classifyFirstOpcodeInMacroFusion(MI.getOpcode()); } -static JumpKind classifySecond(const MachineInstr &MI) { +static X86::SecondMacroFusionInstKind classifySecond(const MachineInstr &MI) { X86::CondCode CC = X86::getCondFromBranch(MI); - if (CC == X86::COND_INVALID) - return JumpKind::Invalid; - - switch (CC) { - default: - return JumpKind::Invalid; - case X86::COND_E: - case X86::COND_NE: - case X86::COND_L: - case X86::COND_LE: - case X86::COND_G: - case X86::COND_GE: - return JumpKind::ELG; - case X86::COND_B: - case X86::COND_BE: - case X86::COND_A: - case X86::COND_AE: - return JumpKind::AB; - case X86::COND_S: - case X86::COND_NS: - case X86::COND_P: - case X86::COND_NP: - case X86::COND_O: - case X86::COND_NO: - return JumpKind::SPO; - } + return X86::classifySecondCondCodeInMacroFusion(CC); } /// Check if the instr pair, FirstMI and SecondMI, should be fused @@ -187,40 +41,27 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, if (!(ST.hasBranchFusion() || ST.hasMacroFusion())) return false; - const JumpKind BranchKind = classifySecond(SecondMI); + const 
X86::SecondMacroFusionInstKind BranchKind = classifySecond(SecondMI); - if (BranchKind == JumpKind::Invalid) + if (BranchKind == X86::SecondMacroFusionInstKind::Invalid) return false; // Second cannot be fused with anything. if (FirstMI == nullptr) return true; // We're only checking whether Second can be fused at all. - const FirstInstrKind TestKind = classifyFirst(*FirstMI); + const X86::FirstMacroFusionInstKind TestKind = classifyFirst(*FirstMI); if (ST.hasBranchFusion()) { // Branch fusion can merge CMP and TEST with all conditional jumps. - return (TestKind == FirstInstrKind::Cmp || - TestKind == FirstInstrKind::Test); + return (TestKind == X86::FirstMacroFusionInstKind::Cmp || + TestKind == X86::FirstMacroFusionInstKind::Test); } if (ST.hasMacroFusion()) { - // Macro Fusion rules are a bit more complex. See Agner Fog's - // Microarchitecture table 9.2 "Instruction Fusion". - switch (TestKind) { - case FirstInstrKind::Test: - case FirstInstrKind::And: - return true; - case FirstInstrKind::Cmp: - case FirstInstrKind::ALU: - return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB; - case FirstInstrKind::IncDec: - return BranchKind == JumpKind::ELG; - case FirstInstrKind::Invalid: - return false; - } + return X86::isMacroFused(TestKind, BranchKind); } - llvm_unreachable("unknown branch fusion type"); + llvm_unreachable("unknown fusion type"); } namespace llvm { diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 3809a14178fdf..f69626b2622e4 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -530,23 +530,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(X86::MXCSR); // Set the stack-pointer register and its aliases as reserved. - for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); - ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RSP)) + Reserved.set(SubReg); // Set the Shadow Stack Pointer as reserved. Reserved.set(X86::SSP); // Set the instruction pointer register and its aliases as reserved. - for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid(); - ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RIP)) + Reserved.set(SubReg); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { - for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid(); - ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP)) + Reserved.set(SubReg); } // Set the base-pointer register and its aliases as reserved if needed. @@ -559,9 +556,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { "this calling convention."); Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); - for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true); - I.isValid(); ++I) - Reserved.set(*I); + for (const MCPhysReg &SubReg : subregs_inclusive(BasePtr)) + Reserved.set(SubReg); } // Mark the segment registers as reserved. diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index f26c2d4c4a287..3cfaf714e93e8 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -295,8 +295,8 @@ def FPSW : X86Reg<"fpsr", 0>; def FPCW : X86Reg<"fpcr", 0>; // SIMD Floating-point control register. -// Note: We only model the current rounding modes and the IEEE masks. 
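For the X86MacroFusion.cpp hunk above: the local enums and opcode switches are replaced by the shared helpers behind `X86::classifyFirstOpcodeInMacroFusion`, `X86::classifySecondCondCodeInMacroFusion` and `X86::isMacroFused`. A hedged sketch of the fusion table the removed switch encoded (after Agner Fog's microarchitecture tables), using stand-in enums rather than the real LLVM types:

```cpp
enum class FirstKind { Test, Cmp, And, ALU, IncDec, Invalid };
enum class JumpKind { ELG, AB, SPO, Invalid };

static bool isMacroFusedSketch(FirstKind F, JumpKind J) {
  if (J == JumpKind::Invalid)
    return false;
  switch (F) {
  case FirstKind::Test:   // TEST fuses with every conditional jump,
  case FirstKind::And:    // and so does AND.
    return true;
  case FirstKind::Cmp:    // CMP/ADD/SUB fuse with equality/signed (ELG)
  case FirstKind::ALU:    // and unsigned (AB) jumps, but not JS/JP/JO.
    return J == JumpKind::ELG || J == JumpKind::AB;
  case FirstKind::IncDec: // INC/DEC only fuse with ELG jumps.
    return J == JumpKind::ELG;
  case FirstKind::Invalid:
    return false;
  }
  return false;
}
```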
-// IEEE flags, FTZ and DAZ are not modeled here. +// Note: We only model the "Uses" of the control bits: current rounding modes, +// DAZ, FTZ and exception masks. We don't model the "Defs" of flag bits. def MXCSR : X86Reg<"mxcsr", 0>; // Status flags register. diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0b3a5319baac3..f64fedd8cbb6a 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2377,6 +2377,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, } int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { + static const CostTblEntry SLMCostTbl[] = { + { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 }, + { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 }, + { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 }, + { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 } + }; + assert(Val->isVectorTy() && "This must be a vector type"); Type *ScalarType = Val->getScalarType(); @@ -2396,6 +2403,13 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { // Floating point scalars are already located in index #0. if (ScalarType->isFloatingPointTy() && Index == 0) return 0; + + int ISD = TLI->InstructionOpcodeToISD(Opcode); + assert(ISD && "Unexpected vector opcode"); + MVT MScalarTy = LT.second.getScalarType(); + if (ST->isSLM()) + if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy)) + return LT.first * Entry->Cost; } // Add to the base cost if we know that the extracted element of a vector is diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index 44d73b5ad5c04..48da7e7bdd03c 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -308,15 +308,16 @@ static const Value *getPointerOperand(const Instruction *I) { return nullptr; } -static const Value *getBasePointerOfAccessPointerOperand(const Instruction *I, - int64_t &BytesOffset, - const DataLayout &DL) { +static const Value * +getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset, + const DataLayout &DL, + bool AllowNonInbounds = false) { const Value *Ptr = getPointerOperand(I); if (!Ptr) return nullptr; return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL, - /*AllowNonInbounds*/ false); + AllowNonInbounds); } ChangeStatus AbstractAttribute::update(Attributor &A) { @@ -1702,8 +1703,7 @@ static int64_t getKnownNonNullAndDerefBytesForUse( return 0; } if (auto *GEP = dyn_cast(I)) - if (GEP->hasAllZeroIndices() || - (GEP->isInBounds() && GEP->hasAllConstantIndices())) { + if (GEP->hasAllConstantIndices()) { TrackUse = true; return 0; } @@ -1718,6 +1718,18 @@ static int64_t getKnownNonNullAndDerefBytesForUse( return std::max(int64_t(0), DerefBytes); } } + + /// Corner case when an offset is 0. 
+ if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Offset == 0 && Base == &AssociatedValue && + getPointerOperand(I) == UseV) { + int64_t DerefBytes = + (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()); + IsNonNull |= !NullPointerIsDefined; + return std::max(int64_t(0), DerefBytes); + } + } if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL, /*AllowNonInbounds*/ false)) { @@ -2949,16 +2961,48 @@ struct AADereferenceableImpl : AADereferenceable { const StateType &getState() const override { return *this; } /// } + /// Helper function for collecting accessed bytes in must-be-executed-context + void addAccessedBytesForUse(Attributor &A, const Use *U, + const Instruction *I) { + const Value *UseV = U->get(); + if (!UseV->getType()->isPointerTy()) + return; + + Type *PtrTy = UseV->getType(); + const DataLayout &DL = A.getDataLayout(); + int64_t Offset; + if (const Value *Base = getBasePointerOfAccessPointerOperand( + I, Offset, DL, /*AllowNonInbounds*/ true)) { + if (Base == &getAssociatedValue() && getPointerOperand(I) == UseV) { + uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType()); + addAccessedBytes(Offset, Size); + } + } + return; + } + /// See AAFromMustBeExecutedContext bool followUse(Attributor &A, const Use *U, const Instruction *I) { bool IsNonNull = false; bool TrackUse = false; int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse( A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse); + + addAccessedBytesForUse(A, U, I); takeKnownDerefBytesMaximum(DerefBytes); return TrackUse; } + /// See AbstractAttribute::manifest(...). + ChangeStatus manifest(Attributor &A) override { + ChangeStatus Change = AADereferenceable::manifest(A); + if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) { + removeAttrs({Attribute::DereferenceableOrNull}); + return ChangeStatus::CHANGED; + } + return Change; + } + void getDeducedAttributes(LLVMContext &Ctx, SmallVectorImpl &Attrs) const override { // TODO: Add *_globally support @@ -3119,6 +3163,20 @@ static unsigned int getKnownAlignForUse(Attributor &A, AbstractAttribute &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { + // We need to follow common pointer manipulation uses to the accesses they + // feed into. + if (isa(I)) { + TrackUse = true; + return 0; + } + if (auto *GEP = dyn_cast(I)) { + if (GEP->hasAllConstantIndices()) { + TrackUse = true; + return 0; + } + } + + unsigned Alignment = 0; if (ImmutableCallSite ICS = ImmutableCallSite(I)) { if (ICS.isBundleOperand(U) || ICS.isCallee(U)) return 0; @@ -3129,23 +3187,34 @@ static unsigned int getKnownAlignForUse(Attributor &A, // dependences here. auto &AlignAA = A.getAAFor(QueryingAA, IRP, /* TrackDependence */ false); - return AlignAA.getKnownAlign(); - } - - // We need to follow common pointer manipulation uses to the accesses they - // feed into. 
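The Attributor changes above (and the `getKnownAlignForUse` hunk that continues just below) both derive facts from a memory access at `Base + Offset`. A hedged sketch of the two deductions, with illustrative helper names that are not the LLVM API:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <numeric>

// Dereferenceability: roughly the rule the deref hunks apply. An access of
// Size bytes at Base + Offset shows Base is dereferenceable for at least
// Offset + Size bytes (clamped at zero for negative offsets).
int64_t derefBytesFromAccess(int64_t Offset, uint64_t Size) {
  return std::max<int64_t>(0, Offset + int64_t(Size));
}

// Alignment: if Base is Align-aligned, then Base + Offset = Align*Q + Offset,
// so the access is aligned to the largest power of two dividing
// gcd(|Offset|, Align). Since Align is a power of two, that gcd already is one.
uint32_t alignFromBase(uint32_t Align, int32_t Offset) {
  if (Align <= 1 || Offset == 0)
    return Align;
  uint32_t Mag = uint32_t(std::abs(int64_t(Offset)));
  uint32_t G = std::gcd(Mag, Align);
  return G & ~(G - 1); // largest power-of-two divisor of G
}

// e.g. alignFromBase(16, 4) == 4, alignFromBase(16, 6) == 2.
```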
- // TODO: Consider gep instruction - if (isa(I)) { - TrackUse = true; - return 0; + Alignment = AlignAA.getKnownAlign(); } + const Value *UseV = U->get(); if (auto *SI = dyn_cast(I)) - return SI->getAlignment(); + Alignment = SI->getAlignment(); else if (auto *LI = dyn_cast(I)) - return LI->getAlignment(); + Alignment = LI->getAlignment(); - return 0; + if (Alignment <= 1) + return 0; + + auto &DL = A.getDataLayout(); + int64_t Offset; + + if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) { + if (Base == &AssociatedValue) { + // BasePointerAddr + Offset = Alignment * Q for some integer Q. + // So we can say that the maximum power of two which is a divisor of + // gcd(Offset, Alignment) is an alignment. + + uint32_t gcd = + greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment); + Alignment = llvm::PowerOf2Floor(gcd); + } + } + + return Alignment; } struct AAAlignImpl : AAAlign { AAAlignImpl(const IRPosition &IRP) : AAAlign(IRP) {} diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp index 9db079c7734e2..2bd3df3add7a0 100644 --- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp +++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp @@ -328,6 +328,9 @@ Function *HotColdSplitting::extractColdRegion( } CI->setIsNoInline(); + if (OrigF->hasSection()) + OutF->setSection(OrigF->getSection()); + markFunctionCold(*OutF, BFI != nullptr); LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 23ca03ff68b0d..92e9a8814f8ff 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -2279,6 +2279,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::copysign: { + const APFloat *C; + if (match(II->getArgOperand(1), m_APFloat(C))) { + // If we know the sign bit of the sign argument, reduce to FABS/FNABS: + // copysign X, PosC --> fabs X + // copysign X, NegC --> fneg (fabs X) + Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0), II); + if (C->isNegative()) + Fabs = Builder.CreateFNegFMF(Fabs, II); + + return replaceInstUsesWith(*II, Fabs); + } + break; + } case Intrinsic::fabs: { Value *Cond; Constant *LHS, *RHS; @@ -3314,6 +3329,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (match(Arg, m_Intrinsic(m_Value(ArgArg))) && II->getType() == ArgArg->getType()) return replaceInstUsesWith(*II, ArgArg); + Constant *XorMask; + if (match(Arg, + m_Xor(m_Intrinsic(m_Value(ArgArg)), + m_Constant(XorMask))) && + II->getType() == ArgArg->getType()) { + if (auto *CI = dyn_cast(XorMask)) { + if (CI->getValue().trunc(16).isAllOnesValue()) { + auto TrueVector = Builder.CreateVectorSplat( + II->getType()->getVectorNumElements(), Builder.getTrue()); + return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector); + } + } + } KnownBits ScalarKnown(32); if (SimplifyDemandedBits(II, 0, APInt::getLowBitsSet(32, 16), ScalarKnown, 0)) @@ -3358,7 +3386,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (const ConstantFP *C = dyn_cast(Src)) { const APFloat &ArgVal = C->getValueAPF(); - APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat Val(ArgVal.getSemantics(), 1); APFloat::opStatus Status = Val.divide(ArgVal, APFloat::rmNearestTiesToEven); // Only do this if it was exact and therefore not dependent on the diff --git 
a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 0390368c4bb40..2171c819fd9e2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/DIBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" +#include using namespace llvm; using namespace PatternMatch; @@ -1820,12 +1821,24 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { } /// This input value (which is known to have vector type) is being zero extended -/// or truncated to the specified vector type. +/// or truncated to the specified vector type. Since the zext/trunc is done +/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, +/// endianness will impact which end of the vector that is extended or +/// truncated. +/// +/// A vector is always stored with index 0 at the lowest address, which +/// corresponds to the most significant bits for a big endian stored integer and +/// the least significant bits for little endian. A trunc/zext of an integer +/// impacts the big end of the integer. Thus, we need to add/remove elements at +/// the front of the vector for big endian targets, and the back of the vector +/// for little endian targets. +/// /// Try to replace it with a shuffle (and vector/vector bitcast) if possible. /// /// The source and destination vector types may have different element types. -static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, - InstCombiner &IC) { +static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal, + VectorType *DestTy, + InstCombiner &IC) { // We can only do this optimization if the output is a multiple of the input // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. @@ -1844,31 +1857,53 @@ static Instruction *optimizeVectorResize(Value *InVal, VectorType *DestTy, InVal = IC.Builder.CreateBitCast(InVal, SrcTy); } + bool IsBigEndian = IC.getDataLayout().isBigEndian(); + unsigned SrcElts = SrcTy->getNumElements(); + unsigned DestElts = DestTy->getNumElements(); + + assert(SrcElts != DestElts && "Element counts should be different."); + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. - SmallVector ShuffleMask; + SmallVector ShuffleMaskStorage; + ArrayRef ShuffleMask; Value *V2; - if (SrcTy->getNumElements() > DestTy->getNumElements()) { - // If we're shrinking the number of elements, just shuffle in the low - // elements from the input and use undef as the second shuffle input. - V2 = UndefValue::get(SrcTy); - for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) - ShuffleMask.push_back(i); + // Produce an identify shuffle mask for the src vector. + ShuffleMaskStorage.resize(SrcElts); + std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0); + if (SrcElts > DestElts) { + // If we're shrinking the number of elements (rewriting an integer + // truncate), just shuffle in the elements corresponding to the least + // significant bits from the input and use undef as the second shuffle + // input. + V2 = UndefValue::get(SrcTy); + // Make sure the shuffle mask selects the "least significant bits" by + // keeping elements from back of the src vector for big endian, and from the + // front for little endian. 
+ ShuffleMask = ShuffleMaskStorage; + if (IsBigEndian) + ShuffleMask = ShuffleMask.take_back(DestElts); + else + ShuffleMask = ShuffleMask.take_front(DestElts); } else { - // If we're increasing the number of elements, shuffle in all of the - // elements from InVal and fill the rest of the result elements with zeros - // from a constant zero. + // If we're increasing the number of elements (rewriting an integer zext), + // shuffle in all of the elements from InVal. Fill the rest of the result + // elements with zeros from a constant zero. V2 = Constant::getNullValue(SrcTy); - unsigned SrcElts = SrcTy->getNumElements(); - for (unsigned i = 0, e = SrcElts; i != e; ++i) - ShuffleMask.push_back(i); - - // The excess elements reference the first element of the zero input. - for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) - ShuffleMask.push_back(SrcElts); + // Use first elt from V2 when indicating zero in the shuffle mask. + uint32_t NullElt = SrcElts; + // Extend with null values in the "most significant bits" by adding elements + // in front of the src vector for big endian, and at the back for little + // endian. + unsigned DeltaElts = DestElts - SrcElts; + if (IsBigEndian) + ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt); + else + ShuffleMaskStorage.append(DeltaElts, NullElt); + ShuffleMask = ShuffleMaskStorage; } return new ShuffleVectorInst(InVal, V2, @@ -2359,8 +2394,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } if (VectorType *DestVTy = dyn_cast(DestTy)) { - if (DestVTy->getNumElements() == 1 && - VectorType::isValidElementType(SrcTy)) { + if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) { Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType()); return InsertElementInst::Create(UndefValue::get(DestTy), Elem, Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); @@ -2375,8 +2409,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { CastInst *SrcCast = cast(Src); if (BitCastInst *BCIn = dyn_cast(SrcCast->getOperand(0))) if (isa(BCIn->getOperand(0)->getType())) - if (Instruction *I = optimizeVectorResize(BCIn->getOperand(0), - cast(DestTy), *this)) + if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts( + BCIn->getOperand(0), cast(DestTy), *this)) return I; } @@ -2392,7 +2426,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (SrcVTy->getNumElements() == 1) { // If our destination is not a vector, then make this a straight // scalar-scalar cast. - if (VectorType::isValidElementType(DestTy)) { + if (!DestTy->isVectorTy()) { Value *Elem = Builder.CreateExtractElement(Src, Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 5fb3ec8757133..071985eb64138 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2566,9 +2566,6 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, Type *Ty = Add->getType(); CmpInst::Predicate Pred = Cmp.getPredicate(); - if (!Add->hasOneUse()) - return nullptr; - // If the add does not wrap, we can always adjust the compare by subtracting // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE // are canonicalized to SGT/SLT/UGT/ULT. 
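To make the endianness reasoning in `optimizeVectorResizeWithIntegerBitCasts` above concrete, here is a standalone sketch of the mask construction (assumes SrcElts != DstElts, as the assert in the hunk requires; plain STL types stand in for LLVM's):

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Identity mask over the source elements, then keep/extend at the end that
// corresponds to the integer's least/most significant bits.
std::vector<uint32_t> resizeMask(unsigned SrcElts, unsigned DstElts,
                                 bool BigEndian) {
  std::vector<uint32_t> M(SrcElts);
  std::iota(M.begin(), M.end(), 0);
  if (SrcElts > DstElts) {      // trunc: keep the low-bits end
    if (BigEndian)
      M.erase(M.begin(), M.end() - DstElts); // low bits live at the back
    else
      M.resize(DstElts);                     // low bits live at the front
  } else {                      // zext: pad the high-bits end with zeros
    uint32_t NullElt = SrcElts; // selects element 0 of the zero vector
    unsigned Delta = DstElts - SrcElts;
    if (BigEndian)
      M.insert(M.begin(), Delta, NullElt);
    else
      M.insert(M.end(), Delta, NullElt);
  }
  return M;
}

// e.g. resizeMask(2, 4, /*BigEndian=*/false) == {0, 1, 2, 2}: both source
// elements, then two zeros taken from the null vector's first element.
```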
@@ -2602,6 +2599,9 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp, return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower)); } + if (!Add->hasOneUse()) + return nullptr; + // X+C (X & -C2) == C // iff C & (C2-1) == 0 // C2 is a power of 2 diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 0b9128a9f5a1c..f7b39d98d4923 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1368,8 +1368,10 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { } // 1 urem X -> zext(X != 1) - if (match(Op0, m_One())) - return CastInst::CreateZExtOrBitCast(Builder.CreateICmpNE(Op1, Op0), Ty); + if (match(Op0, m_One())) { + Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1)); + return CastInst::CreateZExtOrBitCast(Cmp, Ty); + } // X urem C -> X < C ? X : X - C, where C >= signbit. if (match(Op1, m_Negative())) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index bdfbd75d31a84..05a624fde86b6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -704,16 +704,24 @@ static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI, assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) && "Unexpected isUnsigned predicate!"); - // Account for swapped form of subtraction: ((a > b) ? b - a : 0). + // Ensure the sub is of the form: + // (a > b) ? a - b : 0 -> usub.sat(a, b) + // (a > b) ? b - a : 0 -> -usub.sat(a, b) + // Checking for both a-b and a+(-b) as a constant. bool IsNegative = false; - if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A)))) + const APInt *C; + if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) || + (match(A, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C))))) IsNegative = true; - else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B)))) + else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) && + !(match(B, m_APInt(C)) && + match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C))))) return nullptr; - // If sub is used anywhere else, we wouldn't be able to eliminate it - // afterwards. - if (!TrueVal->hasOneUse()) + // If we are adding a negate and the sub and icmp are used anywhere else, we + // would end up with more instructions. + if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse()) return nullptr; // (a > b) ? a - b : 0 -> usub.sat(a, b) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index d31cbc0882ee5..9fabe9def1104 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -435,13 +435,6 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { Worklist.AddValue(EE); return CastInst::Create(CI->getOpcode(), EE, EI.getType()); } - - // If the input is a bitcast from x86_mmx, turn into a single bitcast from - // the mmx type to the scalar type. 
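On the `canonicalizeSaturatedSubtract` hunk above: the extra `m_Add(..., m_SpecificInt(-*C))` patterns are needed because `a - C` is canonicalized to `a + (-C)` when the operand is a constant. The scalar semantics of the fold itself, for reference:

```cpp
#include <cstdint>

// (a > b) ? a - b : 0 is exactly unsigned saturating subtraction.
uint32_t usub_sat(uint32_t A, uint32_t B) {
  return A > B ? A - B : 0;
}

// The swapped form (a > b) ? b - a : 0 wraps to -(a - b), i.e. the negated
// saturating subtraction the transform emits.
uint32_t usub_sat_neg(uint32_t A, uint32_t B) {
  return uint32_t(0) - usub_sat(A, B);
}
```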
- if (CI->getOpcode() == Instruction::BitCast && - EI.getVectorOperandType()->getNumElements() == 1 && - CI->getOperand(0)->getType()->isX86_MMXTy()) - return new BitCastInst(CI->getOperand(0), EI.getType()); } } return nullptr; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 831fdedfc5e55..c7e708127a41f 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -2996,7 +2996,6 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *InsBefore = AllocaVec[0]; IRBuilder<> IRB(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); // Make sure non-instrumented allocas stay in the entry block. Otherwise, // debug info is broken, because only entry-block allocas are treated as @@ -3091,14 +3090,12 @@ void FunctionStackPoisoner::processStaticAllocas() { Instruction *Term = SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false); IRBuilder<> IRBIf(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); StackMallocIdx = StackMallocSizeClass(LocalStackSize); assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass); Value *FakeStackValue = IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx], ConstantInt::get(IntptrTy, LocalStackSize)); IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term, ConstantInt::get(IntptrTy, 0)); @@ -3106,14 +3103,11 @@ void FunctionStackPoisoner::processStaticAllocas() { IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy)); Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false); IRBIf.SetInsertPoint(Term); - IRBIf.SetCurrentDebugLocation(EntryDebugLocation); Value *AllocaValue = DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca; IRB.SetInsertPoint(InsBefore); - IRB.SetCurrentDebugLocation(EntryDebugLocation); LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack); - IRB.SetCurrentDebugLocation(EntryDebugLocation); IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca); DIExprFlags |= DIExpression::DerefBefore; } else { diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index dbe49cbc03c23..21077a52c154c 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -553,7 +553,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S, unsigned NumUses = 0; bool OptForSize = Entry->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (!OptForSize || std::distance(S,E) > 100) { for (auto ConstCand = S; ConstCand != E; ++ConstCand) { NumUses += ConstCand->Uses.size(); diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index 2c4937b6bef21..6ce2d06058cf3 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -717,22 +717,6 @@ bool LoopInterchangeLegality::findInductionAndReductions( return true; } -static bool containsSafePHI(BasicBlock *Block, bool isOuterLoopExitBlock) { - for (PHINode &PHI : Block->phis()) { - // Reduction lcssa phi will have only 1 incoming block that from loop latch. 
- if (PHI.getNumIncomingValues() > 1) - return false; - Instruction *Ins = dyn_cast(PHI.getIncomingValue(0)); - if (!Ins) - return false; - // Incoming value for lcssa phi's in outer loop exit can only be inner loop - // exits lcssa phi else it would not be tightly nested. - if (!isa(Ins) && isOuterLoopExitBlock) - return false; - } - return true; -} - // This function indicates the current limitations in the transform as a result // of which we do not proceed. bool LoopInterchangeLegality::currentLimitations() { @@ -831,21 +815,6 @@ bool LoopInterchangeLegality::currentLimitations() { return true; } - // TODO: We only handle LCSSA PHI's corresponding to reduction for now. - BasicBlock *InnerExit = InnerLoop->getExitBlock(); - if (!containsSafePHI(InnerExit, false)) { - LLVM_DEBUG( - dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "NoLCSSAPHIOuterInner", - InnerLoop->getStartLoc(), - InnerLoop->getHeader()) - << "Only inner loops with LCSSA PHIs can be interchange " - "currently."; - }); - return true; - } - // TODO: Current limitation: Since we split the inner loop latch at the point // were induction variable is incremented (induction.next); We cannot have // more than 1 user of induction.next since it would result in broken code @@ -921,6 +890,28 @@ bool LoopInterchangeLegality::currentLimitations() { return false; } +// We currently only support LCSSA PHI nodes in the inner loop exit, if their +// users are either reduction PHIs or PHIs outside the outer loop (which means +// the we are only interested in the final value after the loop). +static bool +areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, + SmallPtrSetImpl &Reductions) { + BasicBlock *InnerExit = OuterL->getUniqueExitBlock(); + for (PHINode &PHI : InnerExit->phis()) { + // Reduction lcssa phi will have only 1 incoming block that from loop latch. + if (PHI.getNumIncomingValues() > 1) + return false; + if (any_of(PHI.users(), [&Reductions, OuterL](User *U) { + PHINode *PN = dyn_cast(U); + return !PN || (Reductions.find(PN) == Reductions.end() && + OuterL->contains(PN->getParent())); + })) { + return false; + } + } + return true; +} + // We currently support LCSSA PHI nodes in the outer loop exit, if their // incoming values do not come from the outer loop latch or if the // outer loop latch has a single predecessor. In that case, the value will @@ -928,7 +919,7 @@ bool LoopInterchangeLegality::currentLimitations() { // will still be true after interchanging. If we have multiple predecessor, // that may not be the case, e.g. because the outer loop latch may be executed // if the inner loop is not executed. 
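As context for the LCSSA PHI legality checks being reworked above, a hypothetical loop nest of the shape LoopInterchange targets; the only value live out of the inner loop is the reduction `Sum`, so the inner exit's LCSSA PHIs are exactly the reduction PHIs the new `areInnerLoopExitPHIsSupported` accepts:

```cpp
// Cache-hostile original: the inner loop strides by M through A.
int sumColumnsFirst(const int *A, int N, int M) {
  int Sum = 0;
  for (int J = 0; J < M; ++J)   // outer
    for (int I = 0; I < N; ++I) // inner
      Sum += A[I * M + J];
  return Sum;
}

// Interchanged form: unit-stride accesses in the innermost loop.
int sumRowsFirst(const int *A, int N, int M) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    for (int J = 0; J < M; ++J)
      Sum += A[I * M + J];
  return Sum;
}
```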
-static bool areLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { +static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) { BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock(); for (PHINode &PHI : LoopNestExit->phis()) { // FIXME: We currently are not able to detect floating point reductions @@ -1013,7 +1004,19 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, return false; } - if (!areLoopExitPHIsSupported(OuterLoop, InnerLoop)) { + if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop, + OuterInnerReductions)) { + LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n"); + ORE->emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Found unsupported PHI node in loop exit."; + }); + return false; + } + + if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) { LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n"); ORE->emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI", diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 5b822b6b81807..598a85e5b9471 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -545,7 +545,8 @@ class LoadEliminationForLoop { auto *HeaderBB = L->getHeader(); auto *F = HeaderBB->getParent(); bool OptForSize = F->hasOptSize() || - llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI); + llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { LLVM_DEBUG( dbgs() << "Versioning is needed but not allowed when optimizing " diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index bb314310cfa56..4c2b079c6bb5b 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -213,7 +213,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( // Apply size attributes bool OptForSize = L->getHeader()->getParent()->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI); + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) { UP.Threshold = UP.OptSizeThreshold; UP.PartialThreshold = UP.PartialOptSizeThreshold; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 13e44765985f4..d441c6bbf124b 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -331,6 +331,20 @@ static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader, } } +// Return the top-most loop containing ExitBB and having ExitBB as exiting block +// or the loop containing ExitBB, if there is no parent loop containing ExitBB +// as exiting block. +static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) { + Loop *TopMost = LI.getLoopFor(ExitBB); + Loop *Current = TopMost; + while (Current) { + if (Current->isLoopExiting(ExitBB)) + TopMost = Current; + Current = Current->getParentLoop(); + } + return TopMost; +} + /// Unswitch a trivial branch if the condition is loop invariant. 
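On `getTopMostExitingLoop` above: forgetting only the innermost loop around the exit block can leave stale SCEV trip counts for outer loops that also exit through the same block, so the helper walks up as far as the exit edge reaches. A minimal model with stand-in types (not the LLVM classes):

```cpp
#include <set>

struct Block {};
struct Loop {
  Loop *Parent = nullptr;
  std::set<const Block *> Exiting;
  bool isLoopExiting(const Block *B) const { return Exiting.count(B) != 0; }
};

// Highest ancestor of Innermost (the loop containing ExitBB) that still
// exits through ExitBB.
Loop *topMostExitingLoop(Loop *Innermost, const Block *ExitBB) {
  Loop *TopMost = Innermost;
  for (Loop *Cur = Innermost; Cur; Cur = Cur->Parent)
    if (Cur->isLoopExiting(ExitBB))
      TopMost = Cur;
  return TopMost;
}
```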
/// /// This routine should only be called when loop code leading to the branch has @@ -415,9 +429,10 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, }); // If we have scalar evolutions, we need to invalidate them including this - // loop and the loop containing the exit block. + // loop, the loop containing the exit block and the topmost parent loop + // exiting via LoopExitBB. if (SE) { - if (Loop *ExitL = LI.getLoopFor(LoopExitBB)) + if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI)) SE->forgetLoop(ExitL); else // Forget the entire nest as this exits the entire nest. diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp index 71aa585dfe5d6..26d48ee0d23fa 100644 --- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp +++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp @@ -258,7 +258,7 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) { // references in IR module (not in combined index), so we can // ignore them when computing import. We do not export references // of writeonly object. See computeImportForReferencedGlobals - if (ImportIndex.isWriteOnly(GVS) && GVS->refs().size()) + if (ImportIndex.isWriteOnly(GVS)) V->setInitializer(Constant::getNullValue(V->getValueType())); } } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 18a17119b47fd..44513b1f68275 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1119,6 +1119,45 @@ Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) { return CI->getArgOperand(0); } +Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) { + Value *Dst = CI->getArgOperand(0); + Value *Src = CI->getArgOperand(1); + ConstantInt *StopChar = dyn_cast(CI->getArgOperand(2)); + ConstantInt *N = dyn_cast(CI->getArgOperand(3)); + StringRef SrcStr; + if (CI->use_empty() && Dst == Src) + return Dst; + // memccpy(d, s, c, 0) -> nullptr + if (N) { + if (N->isNullValue()) + return Constant::getNullValue(CI->getType()); + if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0, + /*TrimAtNul=*/false) || + !StopChar) + return nullptr; + } else { + return nullptr; + } + + // Wrap arg 'c' of type int to char + size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF); + if (Pos == StringRef::npos) { + if (N->getZExtValue() <= SrcStr.size()) { + B.CreateMemCpy(Dst, 1, Src, 1, CI->getArgOperand(3)); + return Constant::getNullValue(CI->getType()); + } + return nullptr; + } + + Value *NewN = + ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue())); + // memccpy -> llvm.memcpy + B.CreateMemCpy(Dst, 1, Src, 1, NewN); + return Pos + 1 <= N->getZExtValue() + ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN) + : Constant::getNullValue(CI->getType()); +} + Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) { Value *Dst = CI->getArgOperand(0); Value *N = CI->getArgOperand(2); @@ -1696,7 +1735,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) { // TODO: This whole transformation should be backend specific (e.g. some // backends might prefer libcalls or the limit for the exponent might // be different) and it should also consider optimizing for size. 
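For the new `optimizeMemCCpy` above, the C library semantics it folds away, written out as a reference implementation: copy up to and including the first occurrence of the stop character, or N bytes if it never occurs, returning one past the stop byte in the destination or null.

```cpp
#include <cstddef>

void *memccpy_ref(void *Dst, const void *Src, int C, size_t N) {
  const unsigned char *S = static_cast<const unsigned char *>(Src);
  unsigned char *D = static_cast<unsigned char *>(Dst);
  for (size_t I = 0; I < N; ++I) {
    D[I] = S[I];
    if (S[I] == (unsigned char)C)
      return D + I + 1;
  }
  return nullptr; // stop character not found in the first N bytes
}
```

When the source contents and stop character are compile-time constants, the position of the stop byte is known, so the simplifier can emit a plain `llvm.memcpy` of `min(Pos + 1, N)` bytes and a constant GEP or null result, exactly as the hunk does.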
- APFloat LimF(ExpoF->getSemantics(), 33.0), + APFloat LimF(ExpoF->getSemantics(), 33), ExpoA(abs(*ExpoF)); if (ExpoA.compare(LimF) == APFloat::cmpLessThan) { // This transformation applies to integer or integer+0.5 exponents only. @@ -2716,7 +2755,8 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) { // Don't rewrite fputs to fwrite when optimising for size because fwrite // requires more arguments and thus extra MOVs are required. bool OptForSize = CI->getFunction()->hasOptSize() || - llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI); + llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI, + PGSOQueryType::IRPass); if (OptForSize) return nullptr; @@ -2864,6 +2904,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeMemCmp(CI, Builder); case LibFunc_memcpy: return optimizeMemCpy(CI, Builder); + case LibFunc_memccpy: + return optimizeMemCCpy(CI, Builder); case LibFunc_mempcpy: return optimizeMemPCpy(CI, Builder); case LibFunc_memmove: diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp index f1200471cb4f3..cab375225e892 100644 --- a/llvm/lib/Transforms/Utils/SizeOpts.cpp +++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp @@ -28,6 +28,11 @@ cl::opt PGSOColdCodeOnly( cl::desc("Apply the profile guided size optimizations only " "to cold code.")); +cl::opt PGSOIRPassOrTestOnly( + "pgso-ir-pass-or-test-only", cl::Hidden, cl::init(true), + cl::desc("Apply the profile guided size optimizations only" + "to the IR passes or tests.")); + cl::opt ForcePGSO( "force-pgso", cl::Hidden, cl::init(false), cl::desc("Force the (profiled-guided) size optimizations. ")); @@ -70,11 +75,15 @@ struct BasicBlockBFIAdapter { } // end anonymous namespace bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - return shouldFuncOptimizeForSizeImpl(F, PSI, BFI); + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldFuncOptimizeForSizeImpl(F, PSI, BFI, + QueryType); } bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI, - BlockFrequencyInfo *BFI) { - return shouldOptimizeForSizeImpl(BB, PSI, BFI); + BlockFrequencyInfo *BFI, + PGSOQueryType QueryType) { + return shouldOptimizeForSizeImpl(BB, PSI, BFI, + QueryType); } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fcd8b05b88301..f614c3a29e558 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4668,14 +4668,26 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { SetVector Worklist; BasicBlock *Latch = TheLoop->getLoopLatch(); + // Instructions that are scalar with predication must not be considered + // uniform after vectorization, because that would create an erroneous + // replicating region where only a single instance out of VF should be formed. + // TODO: optimize such seldom cases if found important, see PR40816. + auto addToWorklistIfAllowed = [&](Instruction *I) -> void { + if (isScalarWithPredication(I, VF)) { + LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " + << *I << "\n"); + return; + } + LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n"); + Worklist.insert(I); + }; + // Start with the conditional branch. If the branch condition is an // instruction contained in the loop that is only used by the branch, it is // uniform. 
auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); - if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) { - Worklist.insert(Cmp); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Cmp << "\n"); - } + if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse()) + addToWorklistIfAllowed(Cmp); // Holds consecutive and consecutive-like pointers. Consecutive-like pointers // are pointers that are treated like consecutive pointers during @@ -4734,10 +4746,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { // Add to the Worklist all consecutive and consecutive-like pointers that // aren't also identified as possibly non-uniform. for (auto *V : ConsecutiveLikePtrs) - if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) { - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *V << "\n"); - Worklist.insert(V); - } + if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end()) + addToWorklistIfAllowed(V); // Expand Worklist in topological order: whenever a new instruction // is added , its users should be already inside Worklist. It ensures @@ -4763,10 +4773,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { return Worklist.count(J) || (OI == getLoadStorePointerOperand(J) && isUniformDecision(J, VF)); - })) { - Worklist.insert(OI); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n"); - } + })) + addToWorklistIfAllowed(OI); } } @@ -4808,11 +4816,8 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) { continue; // The induction variable and its update instruction will remain uniform. - Worklist.insert(Ind); - Worklist.insert(IndUpdate); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *Ind << "\n"); - LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate - << "\n"); + addToWorklistIfAllowed(Ind); + addToWorklistIfAllowed(IndUpdate); } Uniforms[VF].insert(Worklist.begin(), Worklist.end()); @@ -7434,7 +7439,8 @@ getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) + llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass))) SEL = CM_ScalarEpilogueNotAllowedOptSize; else if (PreferPredicateOverEpilog || Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e4f50a8787df1..949988415a44c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -629,9 +629,10 @@ class BoUpSLP { return MinVecRegSize; } - /// Check if ArrayType or StructType is isomorphic to some VectorType. - /// Accepts homogeneous aggregate of vectors like - /// { <2 x float>, <2 x float> } + /// Check if homogeneous aggregate is isomorphic to some VectorType. + /// Accepts homogeneous multidimensional aggregate of scalars/vectors like + /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, + /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. /// /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 
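The `addToWorklistIfAllowed` refactor in the LoopVectorize hunk above funnels every uniform-candidate insertion through one guard, so no call site can forget the scalar-with-predication check. A sketch of the pattern with stand-in types:

```cpp
#include <set>
#include <string>

struct Instr { std::string Name; bool ScalarWithPredication = false; };

void collectUniforms(const std::set<Instr *> &Candidates,
                     std::set<Instr *> &Worklist) {
  auto AddIfAllowed = [&Worklist](Instr *I) {
    // A predicated scalar runs only for the active lanes, so treating it
    // as uniform (one copy shared by all VF lanes) would be wrong.
    if (I->ScalarWithPredication)
      return;
    Worklist.insert(I);
  };
  for (Instr *I : Candidates)
    AddIfAllowed(I);
}
```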
unsigned canMapToVector(Type *T, const DataLayout &DL) const; @@ -3088,20 +3089,22 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { - unsigned N; - Type *EltTy; - auto *ST = dyn_cast(T); - if (ST) { - N = ST->getNumElements(); - EltTy = *ST->element_begin(); - } else { - N = cast(T)->getNumElements(); - EltTy = cast(T)->getElementType(); - } - - if (auto *VT = dyn_cast(EltTy)) { - EltTy = VT->getElementType(); - N *= VT->getNumElements(); + unsigned N = 1; + Type *EltTy = T; + + while (isa(EltTy)) { + if (auto *ST = dyn_cast(EltTy)) { + // Check that struct is homogeneous. + for (const auto *Ty : ST->elements()) + if (Ty != *ST->element_begin()) + return 0; + N *= ST->getNumElements(); + EltTy = *ST->element_begin(); + } else { + auto *SeqT = cast(EltTy); + N *= SeqT->getNumElements(); + EltTy = SeqT->getElementType(); + } } if (!isValidElementType(EltTy)) @@ -3109,12 +3112,6 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const { uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N)); if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T)) return 0; - if (ST) { - // Check that struct is homogeneous. - for (const auto *Ty : ST->elements()) - if (Ty != *ST->element_begin()) - return 0; - } return N; } @@ -6940,57 +6937,54 @@ class HorizontalReduction { /// %rb = insertelement <4 x float> %ra, float %s1, i32 1 /// %rc = insertelement <4 x float> %rb, float %s2, i32 2 /// %rd = insertelement <4 x float> %rc, float %s3, i32 3 -/// starting from the last insertelement instruction. +/// starting from the last insertelement or insertvalue instruction. /// -/// Returns true if it matches -static bool findBuildVector(InsertElementInst *LastInsertElem, - TargetTransformInfo *TTI, - SmallVectorImpl &BuildVectorOpds, - int &UserCost) { - UserCost = 0; - Value *V = nullptr; - do { - if (auto *CI = dyn_cast(LastInsertElem->getOperand(2))) { - UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, - LastInsertElem->getType(), - CI->getZExtValue()); - } - BuildVectorOpds.push_back(LastInsertElem->getOperand(1)); - V = LastInsertElem->getOperand(0); - if (isa(V)) - break; - LastInsertElem = dyn_cast(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) - return false; - } while (true); - std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); - return true; -} - -/// Like findBuildVector, but looks for construction of aggregate. -/// Accepts homegeneous aggregate of vectors like { <2 x float>, <2 x float> }. +/// Also recognize aggregates like {<2 x float>, <2 x float>}, +/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. +/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. +/// +/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. /// /// \return true if it matches. 
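The generalized `canMapToVector` below peels homogeneous struct/array/vector layers, multiplying the lane count, until a scalar element type remains. A hedged model of that walk, where pointer equality stands in for LLVM's uniqued `Type` objects:

```cpp
#include <vector>

struct Ty {
  bool IsAggregate = false;
  std::vector<const Ty *> Elems; // struct members, or repeated array element
};

// Returns the flattened scalar lane count, or 0 if not homogeneous.
unsigned flattenedLanes(const Ty *T) {
  unsigned N = 1;
  while (T->IsAggregate) {
    if (T->Elems.empty())
      return 0;
    for (const Ty *E : T->Elems)
      if (E != T->Elems.front()) // non-homogeneous aggregate
        return 0;
    N *= (unsigned)T->Elems.size();
    T = T->Elems.front();
  }
  return N;
}

// e.g. {{<2 x float>, <2 x float>}, {<2 x float>, <2 x float>}} flattens to
// four <2 x float> pieces, i.e. 8 float lanes once vectors are peeled too.
```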
-static bool findBuildAggregate(InsertValueInst *IV, TargetTransformInfo *TTI, +static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl &BuildVectorOpds, int &UserCost) { + assert((isa(LastInsertInst) || + isa(LastInsertInst)) && + "Expected insertelement or insertvalue instruction!"); UserCost = 0; do { - if (auto *IE = dyn_cast(IV->getInsertedValueOperand())) { + Value *InsertedOperand; + if (auto *IE = dyn_cast(LastInsertInst)) { + InsertedOperand = IE->getOperand(1); + LastInsertInst = IE->getOperand(0); + if (auto *CI = dyn_cast(IE->getOperand(2))) { + UserCost += TTI->getVectorInstrCost(Instruction::InsertElement, + IE->getType(), CI->getZExtValue()); + } + } else { + auto *IV = cast(LastInsertInst); + InsertedOperand = IV->getInsertedValueOperand(); + LastInsertInst = IV->getAggregateOperand(); + } + if (isa(InsertedOperand) || + isa(InsertedOperand)) { int TmpUserCost; - SmallVector TmpBuildVectorOpds; - if (!findBuildVector(IE, TTI, TmpBuildVectorOpds, TmpUserCost)) + SmallVector TmpBuildVectorOpds; + if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds, + TmpUserCost)) return false; - BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), TmpBuildVectorOpds.rend()); + BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(), + TmpBuildVectorOpds.rend()); UserCost += TmpUserCost; } else { - BuildVectorOpds.push_back(IV->getInsertedValueOperand()); + BuildVectorOpds.push_back(InsertedOperand); } - Value *V = IV->getAggregateOperand(); - if (isa(V)) + if (isa(LastInsertInst)) break; - IV = dyn_cast(V); - if (!IV || !IV->hasOneUse()) + if ((!isa(LastInsertInst) && + !isa(LastInsertInst)) || + !LastInsertInst->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -7177,7 +7171,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, BasicBlock *BB, BoUpSLP &R) { int UserCost; SmallVector BuildVectorOpds; - if (!findBuildVector(IEI, TTI, BuildVectorOpds, UserCost) || + if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) || (llvm::all_of(BuildVectorOpds, [](Value *V) { return isa(V); }) && isShuffle(BuildVectorOpds))) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 123477cd62096..b15c5d0f7dad1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -468,6 +468,11 @@ void VPlan::execute(VPTransformState *State) { updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD +void VPlan::dump() const { dbgs() << *this << '\n'; } +#endif + void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, BasicBlock *LoopLatchBB) { BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor(); @@ -527,8 +532,7 @@ void VPlanPrinter::dump() { if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { OS << ", where:"; if (Plan.BackedgeTakenCount) - OS << "\\n" - << *Plan.getOrCreateBackedgeTakenCount() << " := BackedgeTakenCount"; + OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount"; for (auto Entry : Plan.Value2VPValue) { OS << "\\n" << *Entry.second; OS << DOT::EscapeString(" := "); @@ -540,7 +544,7 @@ void VPlanPrinter::dump() { OS << "edge [fontname=Courier, fontsize=30]\n"; OS << "compound=true\n"; - for (VPBlockBase *Block : depth_first(Plan.getEntry())) + for (const VPBlockBase *Block : depth_first(Plan.getEntry())) dumpBlock(Block); OS << "}\n"; diff 
--git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 226c6c0279d7e..6fabd5c39ba5d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1159,6 +1159,128 @@ class VPRegionBlock : public VPBlockBase { void execute(struct VPTransformState *State) override; }; +//===----------------------------------------------------------------------===// +// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // +//===----------------------------------------------------------------------===// + +// The following set of template specializations implement GraphTraits to treat +// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note +// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the +// VPBlockBase is a VPRegionBlock, this specialization provides access to its +// successors/predecessors but not to the blocks inside the region. + +template <> struct GraphTraits { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +template <> struct GraphTraits { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = SmallVectorImpl::const_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getSuccessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getSuccessors().end(); + } +}; + +// Inverse order specialization for VPBasicBlocks. Predecessors are used instead +// of successors for the inverse traversal. +template <> struct GraphTraits> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = SmallVectorImpl::iterator; + + static NodeRef getEntryNode(Inverse B) { return B.Graph; } + + static inline ChildIteratorType child_begin(NodeRef N) { + return N->getPredecessors().begin(); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return N->getPredecessors().end(); + } +}; + +// The following set of template specializations implement GraphTraits to +// treat VPRegionBlock as a graph and recurse inside its nodes. It's important +// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases +// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so +// there won't be automatic recursion into other VPBlockBases that turn to be +// VPRegionBlocks. + +template <> +struct GraphTraits : public GraphTraits { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. 
+ return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits + : public GraphTraits { + using GraphRef = const VPRegionBlock *; + using nodes_iterator = df_iterator; + + static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getEntry()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + +template <> +struct GraphTraits> + : public GraphTraits> { + using GraphRef = VPRegionBlock *; + using nodes_iterator = df_iterator; + + static NodeRef getEntryNode(Inverse N) { + return N.Graph->getExit(); + } + + static nodes_iterator nodes_begin(GraphRef N) { + return nodes_iterator::begin(N->getExit()); + } + + static nodes_iterator nodes_end(GraphRef N) { + // df_iterator::end() returns an empty iterator so the node used doesn't + // matter. + return nodes_iterator::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1265,6 +1387,9 @@ class VPlan { VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Dump the plan to stderr (for debugging). + void dump() const; + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. @@ -1276,20 +1401,20 @@ class VPlan { /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); friend inline raw_ostream &operator<<(raw_ostream &OS, const struct VPlanIngredient &I); private: raw_ostream &OS; - VPlan &Plan; + const VPlan &Plan; unsigned Depth = 0; unsigned TabWidth = 2; std::string Indent; unsigned BID = 0; SmallDenseMap BlockID; - VPlanPrinter(raw_ostream &O, VPlan &P) : OS(O), Plan(P) {} + VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {} /// Handle indentation. void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1336,134 +1461,12 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { return OS; } -inline raw_ostream &operator<<(raw_ostream &OS, VPlan &Plan) { +inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { VPlanPrinter Printer(OS, Plan); Printer.dump(); return OS; } -//===----------------------------------------------------------------------===// -// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs // -//===----------------------------------------------------------------------===// - -// The following set of template specializations implement GraphTraits to treat -// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note -// that VPBlockBase traits don't recurse into VPRegioBlocks, i.e., if the -// VPBlockBase is a VPRegionBlock, this specialization provides access to its -// successors/predecessors but not to the blocks inside the region. 
- -template <> struct GraphTraits { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl::iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -template <> struct GraphTraits { - using NodeRef = const VPBlockBase *; - using ChildIteratorType = SmallVectorImpl::const_iterator; - - static NodeRef getEntryNode(NodeRef N) { return N; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getSuccessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getSuccessors().end(); - } -}; - -// Inverse order specialization for VPBasicBlocks. Predecessors are used instead -// of successors for the inverse traversal. -template <> struct GraphTraits> { - using NodeRef = VPBlockBase *; - using ChildIteratorType = SmallVectorImpl::iterator; - - static NodeRef getEntryNode(Inverse B) { return B.Graph; } - - static inline ChildIteratorType child_begin(NodeRef N) { - return N->getPredecessors().begin(); - } - - static inline ChildIteratorType child_end(NodeRef N) { - return N->getPredecessors().end(); - } -}; - -// The following set of template specializations implement GraphTraits to -// treat VPRegionBlock as a graph and recurse inside its nodes. It's important -// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases -// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so -// there won't be automatic recursion into other VPBlockBases that turn to be -// VPRegionBlocks. - -template <> -struct GraphTraits : public GraphTraits { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits - : public GraphTraits { - using GraphRef = const VPRegionBlock *; - using nodes_iterator = df_iterator; - - static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getEntry()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. - return nodes_iterator::end(N); - } -}; - -template <> -struct GraphTraits> - : public GraphTraits> { - using GraphRef = VPRegionBlock *; - using nodes_iterator = df_iterator; - - static NodeRef getEntryNode(Inverse N) { - return N.Graph->getExit(); - } - - static nodes_iterator nodes_begin(GraphRef N) { - return nodes_iterator::begin(N->getExit()); - } - - static nodes_iterator nodes_end(GraphRef N) { - // df_iterator::end() returns an empty iterator so the node used doesn't - // matter. 
- return nodes_iterator::end(N); - } -}; - //===----------------------------------------------------------------------===// // VPlan Utilities //===----------------------------------------------------------------------===// diff --git a/llvm/lib/WindowsManifest/CMakeLists.txt b/llvm/lib/WindowsManifest/CMakeLists.txt index 4f2d011d54348..fe6ddcd414d56 100644 --- a/llvm/lib/WindowsManifest/CMakeLists.txt +++ b/llvm/lib/WindowsManifest/CMakeLists.txt @@ -1,18 +1,12 @@ -set(system_libs) -if( CMAKE_HOST_UNIX ) - if( LLVM_LIBXML2_ENABLED ) - set(system_libs ${system_libs} ${LIBXML2_LIBS}) - endif() -endif() - add_llvm_component_library(LLVMWindowsManifest WindowsManifestMerger.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/WindowsManifest - ${Backtrace_INCLUDE_DIRS} + ${Backtrace_INCLUDE_DIRS}) - LINK_LIBS ${system_libs} - ) - -set_property(TARGET LLVMWindowsManifest PROPERTY LLVM_SYSTEM_LIBS "${system_libs}") +if(LIBXML2_LIBRARIES) + target_link_libraries(LLVMWindowsManifest PUBLIC ${LIBXML2_LIBRARIES}) + set_property(TARGET LLVMWindowsManifest PROPERTY + LLVM_SYSTEM_LIBS ${LIBXML2_LIBRARIES}) +endif() diff --git a/llvm/test/Analysis/BranchProbabilityInfo/basic.ll b/llvm/test/Analysis/BranchProbabilityInfo/basic.ll index 64e0a82456f11..8212cc4769045 100644 --- a/llvm/test/Analysis/BranchProbabilityInfo/basic.ll +++ b/llvm/test/Analysis/BranchProbabilityInfo/basic.ll @@ -141,6 +141,24 @@ exit: ret i32 %result } +define i32 @test_cold_loop(i32 %a, i32 %b) { +entry: + %cond1 = icmp eq i32 %a, 42 + br i1 %cond1, label %header, label %exit + +header: + br label %body + +body: + %cond2 = icmp eq i32 %b, 42 + br i1 %cond2, label %header, label %exit +; CHECK: edge body -> header probability is 0x40000000 / 0x80000000 = 50.00% + +exit: + call void @coldfunc() + ret i32 %b +} + declare i32 @regular_function(i32 %i) define i32 @test_cold_call_sites_with_prof(i32 %a, i32 %b, i1 %flag, i1 %flag2) { diff --git a/llvm/test/Analysis/BranchProbabilityInfo/noreturn.ll b/llvm/test/Analysis/BranchProbabilityInfo/noreturn.ll index 0566ca16c2f3a..6e01afd2cfc82 100644 --- a/llvm/test/Analysis/BranchProbabilityInfo/noreturn.ll +++ b/llvm/test/Analysis/BranchProbabilityInfo/noreturn.ll @@ -79,6 +79,32 @@ exit: ret i32 %b } +define i32 @test4(i32 %a, i32 %b) { +; CHECK: Printing analysis {{.*}} for function 'test4' +; Make sure we handle loops post-dominated by unreachables. 
+entry: + %cond1 = icmp eq i32 %a, 42 + br i1 %cond1, label %header, label %exit +; CHECK: edge entry -> header probability is 0x00000001 / 0x80000000 = 0.00% +; CHECK: edge entry -> exit probability is 0x7fffffff / 0x80000000 = 100.00% [HOT edge] + +header: + br label %body + +body: + %cond2 = icmp eq i32 %a, 42 + br i1 %cond2, label %header, label %abort +; CHECK: edge body -> header probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge body -> abort probability is 0x40000000 / 0x80000000 = 50.00% + +abort: + call void @abort() noreturn + unreachable + +exit: + ret i32 %b +} + @_ZTIi = external global i8* ; CHECK-LABEL: throwSmallException diff --git a/llvm/test/Analysis/ConstantFolding/copysign.ll b/llvm/test/Analysis/ConstantFolding/copysign.ll new file mode 100644 index 0000000000000..228ffcb470538 --- /dev/null +++ b/llvm/test/Analysis/ConstantFolding/copysign.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -constprop < %s | FileCheck %s + +declare float @llvm.copysign.f32(float, float) +declare double @llvm.copysign.f64(double, double) + +define float @f32_01() { +; CHECK-LABEL: @f32_01( +; CHECK-NEXT: ret float -1.000000e+00 +; + %x = call float @llvm.copysign.f32(float 1.0, float -2.0) + ret float %x +} + +define float @f32_02() { +; CHECK-LABEL: @f32_02( +; CHECK-NEXT: ret float 2.000000e+00 +; + %x = call float @llvm.copysign.f32(float -2.0, float 1.0) + ret float %x +} + +define float @f32_03() { +; CHECK-LABEL: @f32_03( +; CHECK-NEXT: ret float -2.000000e+00 +; + %x = call float @llvm.copysign.f32(float -2.0, float -1.0) + ret float %x +} + +define double @f64_01() { +; CHECK-LABEL: @f64_01( +; CHECK-NEXT: ret double -1.000000e+00 +; + %x = call double @llvm.copysign.f64(double 1.0, double -2.0) + ret double %x +} + +define double @f64_02() { +; CHECK-LABEL: @f64_02( +; CHECK-NEXT: ret double 1.000000e+00 +; + %x = call double @llvm.copysign.f64(double -1.0, double 2.0) + ret double %x +} + +define double @f64_03() { +; CHECK-LABEL: @f64_03( +; CHECK-NEXT: ret double -1.000000e+00 +; + %x = call double @llvm.copysign.f64(double -1.0, double -2.0) + ret double %x +} diff --git a/llvm/test/Analysis/CostModel/PowerPC/future-cost-model.ll b/llvm/test/Analysis/CostModel/PowerPC/future-cost-model.ll new file mode 100644 index 0000000000000..3e4fb82e600c6 --- /dev/null +++ b/llvm/test/Analysis/CostModel/PowerPC/future-cost-model.ll @@ -0,0 +1,16 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future | FileCheck %s --check-prefix=FUTURE +; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr9 | FileCheck %s --check-prefix=PWR9 + +define void @test(i16 %p1, i16 %p2, <4 x i16> %p3, <4 x i16> %p4) { + %i1 = add i16 %p1, %p2 + %v1 = add <4 x i16> %p3, %p4 + ret void + ; FUTURE: cost of 1 {{.*}} add + ; FUTURE: cost of 1 {{.*}} add + + ; PWR9: cost of 1 {{.*}} add + ; PWR9: cost of 2 {{.*}} add +} + diff --git a/llvm/test/Analysis/CostModel/X86/fptosi.ll b/llvm/test/Analysis/CostModel/X86/fptosi.ll index 7583d6e60c809..bb03b56e48f60 100644 --- a/llvm/test/Analysis/CostModel/X86/fptosi.ll +++ b/llvm/test/Analysis/CostModel/X86/fptosi.ll @@ -6,7 +6,7 @@ ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ ; -; 
RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 @@ -39,6 +39,13 @@ define i32 @fptosi_double_i64(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_double_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptosi <4 x double> undef to <4 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V8I64 = fptosi <8 x double> undef to <8 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_double_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi double undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x double> undef to <2 x i64> @@ -75,6 +82,13 @@ define i32 @fptosi_double_i32(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_double_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = fptosi <4 x double> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8I32 = fptosi <8 x double> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_double_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi double undef to i32 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = fptosi <2 x double> undef to <2 x i32> @@ -111,6 +125,13 @@ define i32 @fptosi_double_i16(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_double_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I16 = fptosi <4 x double> undef to <4 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V8I16 = fptosi <8 x double> undef to <8 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_double_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi double undef to i16 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2I16 = fptosi <2 x double> undef to <2 x i16> @@ -147,6 +168,13 @@ define i32 @fptosi_double_i8(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_double_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_double_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8> @@ -194,6 +222,14 @@ define i32 @fptosi_float_i64(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_float_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V4I64 = fptosi <4 x float> undef to <4 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 75 for instruction: %V8I64 = fptosi <8 x float> undef to <8 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %V16I64 = fptosi <16 x float> undef to <16 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_float_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = fptosi float undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = fptosi <2 x float> undef to <2 x i64> @@ -218,6 +254,13 @@ define i32 @fptosi_float_i32(i32 %arg) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_float_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptosi <8 x float> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptosi <16 x float> undef to <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_float_i32' ; BTVER2-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptosi float undef to i32 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = fptosi <4 x float> undef to <4 x i32> @@ -254,6 +297,13 @@ define i32 @fptosi_float_i16(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_float_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = fptosi <8 x float> undef to <8 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I16 = fptosi <16 x float> undef to <16 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_float_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptosi float undef to i16 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = fptosi <4 x float> undef to <4 x i16> @@ -290,6 +340,13 @@ define i32 @fptosi_float_i8(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptosi_float_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptosi_float_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8> diff --git a/llvm/test/Analysis/CostModel/X86/fptoui.ll b/llvm/test/Analysis/CostModel/X86/fptoui.ll index 078b21ba72033..cdb3e5486604f 100644 --- a/llvm/test/Analysis/CostModel/X86/fptoui.ll +++ b/llvm/test/Analysis/CostModel/X86/fptoui.ll @@ -6,7 +6,7 @@ ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512DQ ; -; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=SLM ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-apple-darwin -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 @@ -39,6 +39,13 @@ define i32 @fptoui_double_i64(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_double_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = fptoui <4 x double> undef to <4 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V8I64 = fptoui <8 x double> undef to <8 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_double_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui double undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x double> undef to <2 x i64> @@ -75,6 +82,13 @@ define i32 @fptoui_double_i32(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_double_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I32 = fptoui <4 x double> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I32 = fptoui <8 x double> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_double_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui double undef to i32 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I32 = fptoui <2 x double> undef to <2 x i32> @@ -111,6 +125,13 @@ define i32 @fptoui_double_i16(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_double_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I16 = fptoui <4 x double> undef to <4 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I16 = fptoui <8 x double> undef to <8 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_double_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui double undef to i16 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I16 = fptoui <2 x double> undef to <2 x i16> @@ -147,6 +168,13 @@ define i32 @fptoui_double_i8(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; 
SLM-LABEL: 'fptoui_double_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_double_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8> @@ -194,6 +222,14 @@ define i32 @fptoui_float_i64(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_float_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = fptoui <4 x float> undef to <4 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V8I64 = fptoui <8 x float> undef to <8 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 199 for instruction: %V16I64 = fptoui <16 x float> undef to <16 x i64> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_float_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = fptoui float undef to i64 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I64 = fptoui <2 x float> undef to <2 x i64> @@ -232,6 +268,13 @@ define i32 @fptoui_float_i32(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_float_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I32 = fptoui <8 x float> undef to <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I32 = fptoui <16 x float> undef to <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_float_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = fptoui float undef to i32 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I32 = fptoui <4 x float> undef to <4 x i32> @@ -268,6 +311,13 @@ define i32 @fptoui_float_i16(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_float_i16' +; SLM-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %I16 = fptoui float undef to i16 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I16 = fptoui <8 x float> undef to <8 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I16 = fptoui <16 x float> undef to <16 x i16> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_float_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = fptoui float undef to i16 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = fptoui <4 x float> undef to <4 x i16> @@ -304,6 +354,13 @@ define i32 @fptoui_float_i8(i32 %arg) { ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; +; SLM-LABEL: 'fptoui_float_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; ; BTVER2-LABEL: 'fptoui_float_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8> diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll index 3ceba32744b6b..4ed509ff9db09 100644 --- a/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll +++ b/llvm/test/Analysis/CostModel/X86/shuffle-extract_subvector.ll @@ -8,8 +8,8 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mattr=+avx512f,+avx512bw,+avx512vbmi | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW ; -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,SLM +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,GLM ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 ; @@ -270,64 +270,123 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512 } define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) { -; SSE-LABEL: 'test_vXi16' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 
x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; SSE2-LABEL: 'test_vXi16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 
x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x 
i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; SSSE3-LABEL: 'test_vXi16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, 
<2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x 
i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX-LABEL: 'test_vXi16' ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> @@ -506,6 +565,124 @@ define void @test_vXi16(<4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; SLM-LABEL: 'test_vXi16' +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: 
%V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'test_vXi16' +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = 
shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89 = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_2345 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_6789 = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 
x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19 = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_02_03_04_05 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V512_06_07_08_09 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i16> %src512, <32 x i16> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; ; BTVER2-LABEL: 'test_vXi16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <4 x i16> %src64, <4 x i16> undef, <2 x i32> @@ -863,125 +1040,6 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; -; SSE42-LABEL: 'test_vXi8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x 
i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -; ; AVX-LABEL: 'test_vXi8' ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> @@ -1339,6 +1397,244 @@ define void @test_vXi8(<8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <6 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; +; SLM-LABEL: 'test_vXi8' +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; 
SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; 
SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 
x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; GLM-LABEL: 'test_vXi8' +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_45 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_67 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_0123 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_4567 = shufflevector <8 x i8> %src64, <8 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_45 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89 = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; 
GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89ABCDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19 = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1A_1B = 
shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17 = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09 = 
shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39 = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %V512_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <2 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <4 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: %V512_20_21_22_23_24_25_26_27 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37 = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <8 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <16 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_00_01_02_03_04_05_06_07_08_09_0A_0B_0C_0D_0E_0F_10_11_12_13_14_15_16_17_18_19_1A_1B_1C_1D_1E_1F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_20_21_22_23_24_25_26_27_28_29_2A_2B_2C_2D_2E_2F_30_31_32_33_34_35_36_37_38_39_3A_3B_3C_3D_3E_3F = shufflevector <64 x i8> %src512, <64 x i8> undef, <32 x i32> +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; ; BTVER2-LABEL: 'test_vXi8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V64_01 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V64_23 = shufflevector <8 x i8> %src64, <8 x i8> undef, <2 x i32> diff --git a/llvm/test/Analysis/CostModel/X86/vector-extract.ll b/llvm/test/Analysis/CostModel/X86/vector-extract.ll index 62123c422a8f5..ddb3654fbc6ab 100644 --- a/llvm/test/Analysis/CostModel/X86/vector-extract.ll +++ b/llvm/test/Analysis/CostModel/X86/vector-extract.ll @@ -9,8 +9,8 @@ ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW ; -; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 -; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 +; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,SLM +; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42,GLM ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2 define i32 @extract_double(i32 %arg) { @@ -188,19 +188,117 @@ define i32 @extract_float(i32 
%arg) { } define i32 @extract_i64(i32 %arg) { -; CHECK-LABEL: 'extract_i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'extract_i64' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'extract_i64' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'extract_i64' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'extract_i64' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'extract_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'extract_i64' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'extract_i64' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v2i64_1 
= extractelement <2 x i64> undef, i32 1 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; SLM-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; GLM-LABEL: 'extract_i64' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = extractelement <2 x i64> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = extractelement <2 x i64> undef, i32 1 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_a = extractelement <4 x i64> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = extractelement <4 x i64> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = extractelement <4 x i64> undef, i32 3 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_a = extractelement <8 x i64> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = extractelement <8 x i64> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = extractelement <8 x i64> undef, i32 3 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = extractelement <8 x i64> undef, i32 4 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = extractelement <8 x i64> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'extract_i64' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_a = extractelement <2 x i64> undef, i32 %arg @@ -234,24 +332,157 @@ define i32 @extract_i64(i32 %arg) { } define i32 @extract_i32(i32 %arg) { -; CHECK-LABEL: 'extract_i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> 
undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'extract_i32' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'extract_i32' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> 
undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'extract_i32' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x 
i32> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'extract_i32' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'extract_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 
x i32> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'extract_i32' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'extract_i32' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i32_1 = 
extractelement <2 x i32> undef, i32 1 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_8 = extractelement <16 x i32> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; GLM-LABEL: 'extract_i32' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = extractelement <2 x i32> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = extractelement <2 x i32> undef, i32 1 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_a = extractelement <4 x i32> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = extractelement <4 x i32> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = extractelement <4 x i32> undef, i32 3 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_a = extractelement <8 x i32> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = extractelement <8 x i32> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = extractelement <8 x i32> undef, i32 3 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = extractelement <8 x i32> undef, i32 4 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = extractelement <8 x i32> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_a = extractelement <16 x i32> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = extractelement <16 x i32> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = extractelement <16 x i32> undef, i32 3 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = extractelement <16 x i32> undef, 
i32 8 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = extractelement <16 x i32> undef, i32 15 +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'extract_i32' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_a = extractelement <2 x i32> undef, i32 %arg @@ -296,24 +527,157 @@ define i32 @extract_i32(i32 %arg) { } define i32 @extract_i16(i32 %arg) { -; CHECK-LABEL: 'extract_i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'extract_i16' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v16i16_15 = extractelement <16 x i16> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'extract_i16' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'extract_i16' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'extract_i16' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement 
<32 x i16> undef, i32 16 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'extract_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'extract_i16' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%v32i16_a = extractelement <32 x i16> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'extract_i16' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; GLM-LABEL: 'extract_i16' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = extractelement <8 x i16> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = extractelement <8 x i16> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %v16i16_a = extractelement <16 x i16> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = extractelement <16 x i16> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = extractelement <16 x i16> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = extractelement <16 x i16> undef, i32 8 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = extractelement <16 x i16> undef, i32 15 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_a = extractelement <32 x i16> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = extractelement <32 x i16> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = extractelement <32 x i16> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = extractelement <32 x i16> undef, i32 8 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = extractelement <32 x i16> undef, i32 15 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = extractelement <32 x i16> undef, i32 16 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = extractelement <32 x i16> undef, i32 24 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = extractelement <32 x i16> undef, i32 31 +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'extract_i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_a = extractelement <8 x i16> undef, i32 %arg @@ -357,29 +721,197 @@ define i32 @extract_i16(i32 %arg) { } define i32 @extract_i8(i32 %arg) { -; CHECK-LABEL: 'extract_i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 -; CHECK-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 -; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE2-LABEL: 'extract_i8' +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; SSE2-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE3-LABEL: 'extract_i8' +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSSE3-LABEL: 'extract_i8' +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SSE41-LABEL: 'extract_i8' +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; SSE41-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; SSE41-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SSE41-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX-LABEL: 'extract_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; AVX-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX512-LABEL: 'extract_i8' +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; SLM-LABEL: 'extract_i8' +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; 
SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; GLM-LABEL: 'extract_i8' +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = extractelement <16 x i8> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = extractelement <16 x i8> undef, i32 8 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = extractelement <16 x i8> undef, i32 15 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_a = extractelement <32 x i8> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = extractelement <32 x i8> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = extractelement <32 x i8> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = extractelement <32 x i8> undef, i32 8 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = extractelement <32 x i8> undef, i32 15 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = extractelement <32 x i8> undef, i32 24 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = extractelement <32 x i8> undef, i32 31 +; GLM-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %v64i8_a = extractelement <64 x i8> undef, i32 %arg +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = extractelement <64 x i8> undef, i32 0 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = extractelement <64 x i8> undef, i32 7 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = extractelement <64 x i8> undef, i32 8 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = extractelement <64 x i8> undef, i32 15 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = extractelement <64 x i8> undef, i32 24 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = extractelement <64 x i8> undef, i32 31 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = extractelement <64 x i8> undef, i32 32 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = extractelement <64 x i8> undef, i32 48 +; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = extractelement <64 x i8> undef, i32 63 +; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; BTVER2-LABEL: 'extract_i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = extractelement <16 x i8> undef, i32 %arg diff --git a/llvm/test/Analysis/DDG/basic-a.ll b/llvm/test/Analysis/DDG/basic-a.ll index 920e71f6717be..a52e8c258f501 100644 --- a/llvm/test/Analysis/DDG/basic-a.ll +++ b/llvm/test/Analysis/DDG/basic-a.ll @@ -1,7 +1,44 @@ ; RUN: opt < %s -disable-output "-passes=print" 2>&1 | FileCheck %s ; CHECK-LABEL: 'DDG' for loop 'test1.for.body': -; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction + +; CHECK: Node Address:[[PI:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT: --- start of nodes in pi-block --- +; CHECK-NEXT: Node Address:[[N10:0x[0-9a-f]*]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %inc = add i64 %i.02, 1 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N11:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N11]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test1.for.body ], [ 0, %test1.for.body.preheader ] +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N10]] +; CHECK-NEXT: --- end of nodes in pi-block --- +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N1:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N7]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %exitcond = icmp ne i64 %inc, %n +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N8]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: br i1 %exitcond, label %test1.for.body, label %for.end.loopexit +; CHECK-NEXT: Edges:none! 
+ +; CHECK: Node Address:[[N6]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %a, i64 %i.02 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N5:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N1]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 ; CHECK-NEXT: Edges: @@ -23,12 +60,6 @@ ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %add = fadd float %0, %conv ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N5:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N6:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %a, i64 %i.02 -; CHECK-NEXT: Edges: ; CHECK-NEXT: [def-use] to [[N5]] ; CHECK: Node Address:[[N5]]:single-instruction @@ -36,36 +67,6 @@ ; CHECK-NEXT: store float %add, float* %arrayidx1, align 4 ; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N7:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %exitcond = icmp ne i64 %inc, %n -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N8]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %exitcond, label %test1.for.body, label %for.end.loopexit -; CHECK-NEXT: Edges:none! - -; CHECK: Node Address:[[N9:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT: --- start of nodes in pi-block --- -; CHECK-NEXT: Node Address:[[N10:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc = add i64 %i.02, 1 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N11:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N11]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test1.for.body ], [ 0, %test1.for.body.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N10]] -; CHECK-NEXT: --- end of nodes in pi-block --- -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N1]] -; CHECK-NEXT: [def-use] to [[N6]] -; CHECK-NEXT: [def-use] to [[N7]] - ;; No memory dependencies. 
;; void test1(unsigned long n, float * restrict a, float * restrict b) { @@ -96,78 +97,80 @@ for.end: ; preds = %test1.for.body, %en ; CHECK-LABEL: 'DDG' for loop 'test2.for.body': -; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK: Node Address:[[N2]]:single-instruction +; CHECK: Node Address:[[PI:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT: --- start of nodes in pi-block --- +; CHECK: Node Address:[[N11:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %0 = load float, float* %arrayidx, align 4 +; CHECK-NEXT: %inc = add i64 %i.02, 1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N12:0x[0-9a-f]*]] -; CHECK: Node Address:[[N4:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N12]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %a, i64 %i.02 +; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test2.for.body ], [ 0, %test2.for.body.preheader ] ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N5:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N11]] +; CHECK-NEXT: --- end of nodes in pi-block --- +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N1:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] -; CHECK: Node Address:[[N5]]:single-instruction +; CHECK: Node Address:[[N8]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %1 = load float, float* %arrayidx1, align 4 +; CHECK-NEXT: %exitcond = icmp ne i64 %inc, %n ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N3]] -; CHECK-NEXT: [memory] to [[N6:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] -; CHECK: Node Address:[[N3]]:single-instruction +; CHECK: Node Address:[[N9]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %add = fadd float %0, %1 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N6]] +; CHECK-NEXT: br i1 %exitcond, label %test2.for.body, label %for.end.loopexit +; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N7:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N7]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %arrayidx2 = getelementptr inbounds float, float* %a, i64 %i.02 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N6]] +; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] -; CHECK: Node Address:[[N6]]:single-instruction +; CHECK: Node Address:[[N4]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: store float %add, float* %arrayidx2, align 4 -; CHECK-NEXT: Edges:none! 
+; CHECK-NEXT: %arrayidx1 = getelementptr inbounds float, float* %a, i64 %i.02 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N5:0x[0-9a-f]*]] -; CHECK: Node Address:[[N8:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N5]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %exitcond = icmp ne i64 %inc, %n +; CHECK-NEXT: %1 = load float, float* %arrayidx1, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] +; CHECK-NEXT: [memory] to [[N6]] -; CHECK: Node Address:[[N9]]:single-instruction +; CHECK: Node Address:[[N1]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %exitcond, label %test2.for.body, label %for.end.loopexit -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK: Node Address:[[N10:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT: --- start of nodes in pi-block --- -; CHECK: Node Address:[[N11:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N2]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc = add i64 %i.02, 1 +; CHECK-NEXT: %0 = load float, float* %arrayidx, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N12:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N3]] -; CHECK: Node Address:[[N12]]:single-instruction +; CHECK: Node Address:[[N3]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test2.for.body ], [ 0, %test2.for.body.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N11]] -; CHECK-NEXT: --- end of nodes in pi-block --- +; CHECK-NEXT: %add = fadd float %0, %1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N1]] -; CHECK-NEXT: [def-use] to [[N4]] -; CHECK-NEXT: [def-use] to [[N7]] -; CHECK-NEXT: [def-use] to [[N8]] +; CHECK-NEXT: [def-use] to [[N6]] + +; CHECK: Node Address:[[N6]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: store float %add, float* %arrayidx2, align 4 +; CHECK-NEXT: Edges:none! + ;; Loop-independent memory dependencies. 
diff --git a/llvm/test/Analysis/DDG/basic-b.ll b/llvm/test/Analysis/DDG/basic-b.ll index f83f7fe92f3b3..757c706193a5b 100644 --- a/llvm/test/Analysis/DDG/basic-b.ll +++ b/llvm/test/Analysis/DDG/basic-b.ll @@ -1,19 +1,45 @@ ; RUN: opt < %s -disable-output "-passes=print" 2>&1 | FileCheck %s ; CHECK-LABEL: 'DDG' for loop 'test1.for.body': -; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction + +; CHECK: Node Address:[[N9:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT:--- start of nodes in pi-block --- +; CHECK: Node Address:[[N13:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 +; CHECK-NEXT: %inc = add i64 %i.02, 1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N14:0x[0-9a-f]*]] -; CHECK: Node Address:[[N2]]:single-instruction +; CHECK: Node Address:[[N14]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %0 = load float, float* %arrayidx, align 4 +; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test1.for.body ], [ 1, %test1.for.body.preheader ] +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N13]] +; CHECK-NEXT:--- end of nodes in pi-block --- +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N1:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N7]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %cmp = icmp ult i64 %inc, %sub +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N8]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: br i1 %cmp, label %test1.for.body, label %for.end.loopexit +; CHECK-NEXT: Edges:none! + +; CHECK: Node Address:[[N6]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %a, i64 %i.02 ; CHECK-NEXT: Edges: ; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] -; CHECK: Node Address:[[N4:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N4]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %sub1 = add i64 %i.02, -1 ; CHECK-NEXT: Edges: @@ -25,22 +51,17 @@ ; CHECK-NEXT: Edges: ; CHECK-NEXT: [def-use] to [[N3]] -; CHECK: Node Address:[[N6:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N1]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %a, i64 %i.02 +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N3]] +; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK: Node Address:[[N7:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N2]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %cmp = icmp ult i64 %inc, %sub +; CHECK-NEXT: %0 = load float, float* %arrayidx, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N8]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %cmp, label %test1.for.body, label %for.end.loopexit -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: [def-use] to [[N3]] ; CHECK: Node Address:[[N3]]:pi-block ; CHECK-NEXT: --- start of nodes in pi-block --- @@ -64,25 +85,6 @@ ; CHECK-NEXT:--- end of nodes in pi-block --- ; CHECK-NEXT: Edges:none! 
-; CHECK: Node Address:[[N9:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT:--- start of nodes in pi-block --- -; CHECK: Node Address:[[N13:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc = add i64 %i.02, 1 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N14:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N14]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test1.for.body ], [ 1, %test1.for.body.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N13]] -; CHECK-NEXT:--- end of nodes in pi-block --- -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N1]] -; CHECK-NEXT: [def-use] to [[N4]] -; CHECK-NEXT: [def-use] to [[N6]] -; CHECK-NEXT: [def-use] to [[N7]] ;; Loop-carried dependence requiring edge-reversal to expose a cycle @@ -117,19 +119,45 @@ for.end: ; preds = %test1.for.body, %en } ; CHECK-LABEL: 'DDG' for loop 'test2.for.body': -; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction + +; CHECK: Node Address:[[N11:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT:--- start of nodes in pi-block --- +; CHECK: Node Address:[[N12:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 +; CHECK-NEXT: %inc = add i64 %i.02, 1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N13:0x[0-9a-f]*]] -; CHECK: Node Address:[[N2]]:single-instruction +; CHECK: Node Address:[[N13]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %0 = load float, float* %arrayidx, align 4 +; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test2.for.body ], [ 1, %test2.for.body.preheader ] ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N12]] +; CHECK-NEXT:--- end of nodes in pi-block --- +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N1:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N9]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %cmp = icmp ult i64 %inc, %sub +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N10:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N10]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: br i1 %cmp, label %test2.for.body, label %for.end.loopexit +; CHECK-NEXT: Edges:none! 
+ +; CHECK: Node Address:[[N8]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %a, i64 %i.02 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] -; CHECK: Node Address:[[N4:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N4]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %add1 = add i64 %i.02, 1 ; CHECK-NEXT: Edges: @@ -145,57 +173,33 @@ for.end: ; preds = %test1.for.body, %en ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %1 = load float, float* %arrayidx2, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N3]] -; CHECK-NEXT: [memory] to [[N7:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] +; CHECK-NEXT: [memory] to [[N7]] -; CHECK: Node Address:[[N3]]:single-instruction +; CHECK: Node Address:[[N1]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %add = fadd float %0, %1 +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %i.02 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N7]] +; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK: Node Address:[[N8:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N2]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx3 = getelementptr inbounds float, float* %a, i64 %i.02 +; CHECK-NEXT: %0 = load float, float* %arrayidx, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N7]] +; CHECK-NEXT: [def-use] to [[N3]] -; CHECK: Node Address:[[N7]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: store float %add, float* %arrayidx3, align 4 -; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N9:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N3]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %cmp = icmp ult i64 %inc, %sub +; CHECK-NEXT: %add = fadd float %0, %1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N10:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N7]] -; CHECK: Node Address:[[N10]]:single-instruction +; CHECK: Node Address:[[N7]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %cmp, label %test2.for.body, label %for.end.loopexit +; CHECK-NEXT: store float %add, float* %arrayidx3, align 4 ; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N11:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT:--- start of nodes in pi-block --- -; CHECK: Node Address:[[N12:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc = add i64 %i.02, 1 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N13:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N13]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %i.02 = phi i64 [ %inc, %test2.for.body ], [ 1, %test2.for.body.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N12]] -; CHECK-NEXT:--- end of nodes in pi-block --- -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N1]] -; CHECK-NEXT: [def-use] to [[N4]] -; CHECK-NEXT: [def-use] to [[N8]] -; CHECK-NEXT: [def-use] to [[N9]] - ;; Forward loop-carried dependence *not* causing a cycle. 
;; void test2(unsigned long n, float * restrict a, float * restrict b) { diff --git a/llvm/test/Analysis/DDG/basic-loopnest.ll b/llvm/test/Analysis/DDG/basic-loopnest.ll index aded488ef2365..41c2cbbdc7a15 100644 --- a/llvm/test/Analysis/DDG/basic-loopnest.ll +++ b/llvm/test/Analysis/DDG/basic-loopnest.ll @@ -2,73 +2,65 @@ ; CHECK-LABEL: 'DDG' for loop 'test1.for.cond1.preheader': -; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %sub = add i64 %n, -1 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] -; CHECK: Node Address:[[N3]]:single-instruction +; CHECK: Node Address:[[N28:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT:--- start of nodes in pi-block --- +; CHECK: Node Address:[[N29:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %cmp21 = icmp ult i64 1, %sub +; CHECK-NEXT: %inc = add i64 %j.02, 1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N4]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %cmp21, label %for.body4.preheader, label %for.inc12 -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: [def-use] to [[N30:0x[0-9a-f]*]] -; CHECK: Node Address:[[N5:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N30]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %0 = mul nsw i64 %i.04, %n +; CHECK-NEXT: %j.02 = phi i64 [ %inc, %for.body4 ], [ 1, %for.body4.preheader ] ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N6]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %0 +; CHECK-NEXT: [def-use] to [[N29]] +; CHECK-NEXT:--- end of nodes in pi-block --- ; CHECK-NEXT: Edges: ; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N13:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N16:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK: Node Address:[[N7]]:single-instruction +; CHECK: Node Address:[[N13]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %arrayidx, i64 %j.02 +; CHECK-NEXT: %sub7 = add i64 %j.02, -1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N12:0x[0-9a-f]*]] -; CHECK: Node Address:[[N8]]:single-instruction +; CHECK: Node Address:[[N25:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT:--- start of nodes in pi-block --- +; CHECK: Node Address:[[N26:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %1 = load float, float* %arrayidx5, align 4 +; CHECK-NEXT: %inc13 = add i64 %i.04, 1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N27:0x[0-9a-f]*]] -; CHECK: Node Address:[[N10:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N27]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %2 = mul nsw i64 %i.04, %n +; CHECK-NEXT: %i.04 = phi i64 [ %inc13, %for.inc12 ], [ 0, %test1.for.cond1.preheader.preheader ] ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N11:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N11]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx6 = getelementptr inbounds float, float* %a, i64 %2 +; CHECK-NEXT: [def-use] to [[N26]] +; CHECK-NEXT:--- end of nodes in pi-block --- ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N12:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to 
[[N5:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N10:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N14:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N18:0x[0-9a-f]*]] -; CHECK: Node Address:[[N13:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N18]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %sub7 = add i64 %j.02, -1 +; CHECK-NEXT: %exitcond = icmp ne i64 %inc13, %n ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N12]] +; CHECK-NEXT: [def-use] to [[N19:0x[0-9a-f]*]] -; CHECK: Node Address:[[N12]]:single-instruction +; CHECK: Node Address:[[N19]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx8 = getelementptr inbounds float, float* %arrayidx6, i64 %sub7 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N9]] +; CHECK-NEXT: br i1 %exitcond, label %test1.for.cond1.preheader, label %for.end14.loopexit +; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N14:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N14]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %4 = mul nsw i64 %i.04, %n ; CHECK-NEXT: Edges: @@ -78,45 +70,55 @@ ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %arrayidx10 = getelementptr inbounds float, float* %a, i64 %4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N16:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N16]] ; CHECK: Node Address:[[N16]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %arrayidx11 = getelementptr inbounds float, float* %arrayidx10, i64 %j.02 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N9]] +; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] -; CHECK: Node Address:[[N2]]:single-instruction +; CHECK: Node Address:[[N10]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %cmp2 = icmp ult i64 %inc, %sub +; CHECK-NEXT: %2 = mul nsw i64 %i.04, %n ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N17:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N11:0x[0-9a-f]*]] -; CHECK: Node Address:[[N17]]:single-instruction +; CHECK: Node Address:[[N11]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %cmp2, label %for.body4, label %for.inc12.loopexit -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: %arrayidx6 = getelementptr inbounds float, float* %a, i64 %2 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N12]] -; CHECK: Node Address:[[N18:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N12]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %exitcond = icmp ne i64 %inc13, %n +; CHECK-NEXT: %arrayidx8 = getelementptr inbounds float, float* %arrayidx6, i64 %sub7 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N19:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N9]] -; CHECK: Node Address:[[N19]]:single-instruction +; CHECK: Node Address:[[N5]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %exitcond, label %test1.for.cond1.preheader, label %for.end14.loopexit -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: %0 = mul nsw i64 %i.04, %n +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] -; CHECK: Node Address:[[N20:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N6]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br label %for.body4 -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %0 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N7]] -; CHECK: Node Address:[[N21:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N7]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br label %for.inc12 -; CHECK-NEXT: Edges:none! 
+; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %arrayidx, i64 %j.02 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N8]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %1 = load float, float* %arrayidx5, align 4 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N9]] ; CHECK: Node Address:[[N9]]:pi-block ; CHECK-NEXT:--- start of nodes in pi-block --- @@ -140,46 +142,44 @@ ; CHECK-NEXT:--- end of nodes in pi-block --- ; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N25:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT:--- start of nodes in pi-block --- -; CHECK: Node Address:[[N26:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N21:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc13 = add i64 %i.04, 1 -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N27:0x[0-9a-f]*]] +; CHECK-NEXT: br label %for.inc12 +; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N27]]:single-instruction +; CHECK: Node Address:[[N20:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %i.04 = phi i64 [ %inc13, %for.inc12 ], [ 0, %test1.for.cond1.preheader.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N26]] -; CHECK-NEXT:--- end of nodes in pi-block --- -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N5]] -; CHECK-NEXT: [def-use] to [[N10]] -; CHECK-NEXT: [def-use] to [[N14]] -; CHECK-NEXT: [def-use] to [[N18]] +; CHECK-NEXT: br label %for.body4 +; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N28:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT:--- start of nodes in pi-block --- -; CHECK: Node Address:[[N29:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc = add i64 %j.02, 1 +; CHECK-NEXT: %sub = add i64 %n, -1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N30:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N2]] +; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] -; CHECK: Node Address:[[N30]]:single-instruction +; CHECK: Node Address:[[N3]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %j.02 = phi i64 [ %inc, %for.body4 ], [ 1, %for.body4.preheader ] +; CHECK-NEXT: %cmp21 = icmp ult i64 1, %sub ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N29]] -; CHECK-NEXT:--- end of nodes in pi-block --- +; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N4]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: br i1 %cmp21, label %for.body4.preheader, label %for.inc12 +; CHECK-NEXT: Edges:none! + +; CHECK: Node Address:[[N2]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %cmp2 = icmp ult i64 %inc, %sub ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N7]] -; CHECK-NEXT: [def-use] to [[N13]] -; CHECK-NEXT: [def-use] to [[N16]] -; CHECK-NEXT: [def-use] to [[N2]] +; CHECK-NEXT: [def-use] to [[N17:0x[0-9a-f]*]] +; CHECK: Node Address:[[N17]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: br i1 %cmp2, label %for.body4, label %for.inc12.loopexit +; CHECK-NEXT: Edges:none! ;; This test has a cycle. 
@@ -232,49 +232,83 @@ for.end14: ; preds = %for.inc12, %entry ; CHECK-LABEL: 'DDG' for loop 'test2.for.cond1.preheader': -; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction + +; CHECK: Node Address:[[PI1:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT:--- start of nodes in pi-block --- +; CHECK: Node Address:[[N28:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %sub = add i64 %n, -1 +; CHECK-NEXT: %inc = add i64 %j.02, 1 ; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N29:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N29]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %j.02 = phi i64 [ %inc, %for.body4 ], [ 1, %for.body4.preheader ] +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N28]] +; CHECK-NEXT:--- end of nodes in pi-block --- +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N13:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N18:0x[0-9a-f]*]] ; CHECK-NEXT: [def-use] to [[N2:0x[0-9a-f]*]] -; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] -; CHECK: Node Address:[[N3]]:single-instruction +; CHECK: Node Address:[[N13]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %cmp21 = icmp ult i64 1, %sub +; CHECK-NEXT: %add7 = add i64 %j.02, 1 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N12:0x[0-9a-f]*]] -; CHECK: Node Address:[[N4]]:single-instruction +; CHECK: Node Address:[[N24:0x[0-9a-f]*]]:pi-block +; CHECK-NEXT:--- start of nodes in pi-block --- +; CHECK: Node Address:[[N25:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %cmp21, label %for.body4.preheader, label %for.inc12 -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: %inc13 = add i64 %i.04, 1 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N26:0x[0-9a-f]*]] -; CHECK: Node Address:[[N5:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N26]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %0 = mul nsw i64 %i.04, %n +; CHECK-NEXT: %i.04 = phi i64 [ %inc13, %for.inc12 ], [ 0, %test2.for.cond1.preheader.preheader ] ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N25]] +; CHECK-NEXT:--- end of nodes in pi-block --- +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N5:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N10:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N16:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N20:0x[0-9a-f]*]] -; CHECK: Node Address:[[N6]]:single-instruction +; CHECK: Node Address:[[N20]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %0 +; CHECK-NEXT: %exitcond = icmp ne i64 %inc13, %n ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N7:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N21:0x[0-9a-f]*]] -; CHECK: Node Address:[[N7]]:single-instruction +; CHECK: Node Address:[[N21]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %arrayidx, i64 %j.02 +; CHECK-NEXT: br i1 %exitcond, label %test2.for.cond1.preheader, label %for.end14.loopexit +; CHECK-NEXT: Edges:none! 
+ +; CHECK: Node Address:[[N16]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %4 = mul nsw i64 %i.04, %n ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N17:0x[0-9a-f]*]] -; CHECK: Node Address:[[N8]]:single-instruction +; CHECK: Node Address:[[N17]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %1 = load float, float* %arrayidx5, align 4 +; CHECK-NEXT: %arrayidx10 = getelementptr inbounds float, float* %a, i64 %4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N18]] -; CHECK: Node Address:[[N10:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N18]]:single-instruction +; CHECK-NEXT: Instructions: +; CHECK-NEXT: %arrayidx11 = getelementptr inbounds float, float* %arrayidx10, i64 %j.02 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N15:0x[0-9a-f]*]] + +; CHECK: Node Address:[[N10]]:single-instruction ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %2 = mul nsw i64 %i.04, %n ; CHECK-NEXT: Edges: @@ -284,12 +318,6 @@ for.end14: ; preds = %for.inc12, %entry ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %arrayidx6 = getelementptr inbounds float, float* %a, i64 %2 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N12:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N13:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %add7 = add i64 %j.02, 1 -; CHECK-NEXT: Edges: ; CHECK-NEXT: [def-use] to [[N12]] ; CHECK: Node Address:[[N12]]:single-instruction @@ -302,58 +330,47 @@ for.end14: ; preds = %for.inc12, %entry ; CHECK-NEXT: Instructions: ; CHECK-NEXT: %3 = load float, float* %arrayidx8, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N9]] -; CHECK-NEXT: [memory] to [[N15:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N9:0x[0-9a-f]*]] +; CHECK-NEXT: [memory] to [[N15]] -; CHECK: Node Address:[[N9]]:single-instruction +; CHECK: Node Address:[[N5]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %add = fadd float %1, %3 +; CHECK-NEXT: %0 = mul nsw i64 %i.04, %n ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N15]] +; CHECK-NEXT: [def-use] to [[N6:0x[0-9a-f]*]] -; CHECK: Node Address:[[N16:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N6]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %4 = mul nsw i64 %i.04, %n +; CHECK-NEXT: %arrayidx = getelementptr inbounds float, float* %b, i64 %0 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N17:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N7]] -; CHECK: Node Address:[[N17]]:single-instruction +; CHECK: Node Address:[[N7]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx10 = getelementptr inbounds float, float* %a, i64 %4 +; CHECK-NEXT: %arrayidx5 = getelementptr inbounds float, float* %arrayidx, i64 %j.02 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N18:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N8:0x[0-9a-f]*]] -; CHECK: Node Address:[[N18]]:single-instruction +; CHECK: Node Address:[[N8]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %arrayidx11 = getelementptr inbounds float, float* %arrayidx10, i64 %j.02 +; CHECK-NEXT: %1 = load float, float* %arrayidx5, align 4 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N15]] - -; CHECK: Node Address:[[N15]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: store float %add, float* %arrayidx11, align 4 -; CHECK-NEXT: Edges:none! 
+; CHECK-NEXT: [def-use] to [[N9]] -; CHECK: Node Address:[[N2]]:single-instruction +; CHECK: Node Address:[[N9]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %cmp2 = icmp ult i64 %inc, %sub +; CHECK-NEXT: %add = fadd float %1, %3 ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N19:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N15]] -; CHECK: Node Address:[[N19]]:single-instruction +; CHECK: Node Address:[[N15]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %cmp2, label %for.body4, label %for.inc12.loopexit +; CHECK-NEXT: store float %add, float* %arrayidx11, align 4 ; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N20:0x[0-9a-f]*]]:single-instruction -; CHECK-NEXT: Instructions: -; CHECK-NEXT: %exitcond = icmp ne i64 %inc13, %n -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N21:0x[0-9a-f]*]] - -; CHECK: Node Address:[[N21]]:single-instruction +; CHECK: Node Address:[[N23:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br i1 %exitcond, label %test2.for.cond1.preheader, label %for.end14.loopexit +; CHECK-NEXT: br label %for.inc12 ; CHECK-NEXT: Edges:none! ; CHECK: Node Address:[[N22:0x[0-9a-f]*]]:single-instruction @@ -361,50 +378,34 @@ for.end14: ; preds = %for.inc12, %entry ; CHECK-NEXT: br label %for.body4 ; CHECK-NEXT: Edges:none! -; CHECK: Node Address:[[N23:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N1:0x[0-9a-f]*]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: br label %for.inc12 -; CHECK-NEXT: Edges:none! +; CHECK-NEXT: %sub = add i64 %n, -1 +; CHECK-NEXT: Edges: +; CHECK-NEXT: [def-use] to [[N2]] +; CHECK-NEXT: [def-use] to [[N3:0x[0-9a-f]*]] -; CHECK: Node Address:[[N24:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT:--- start of nodes in pi-block --- -; CHECK: Node Address:[[N25:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N3]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc13 = add i64 %i.04, 1 +; CHECK-NEXT: %cmp21 = icmp ult i64 1, %sub ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N26:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N4:0x[0-9a-f]*]] -; CHECK: Node Address:[[N26]]:single-instruction +; CHECK: Node Address:[[N4]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %i.04 = phi i64 [ %inc13, %for.inc12 ], [ 0, %test2.for.cond1.preheader.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N25]] -; CHECK-NEXT:--- end of nodes in pi-block --- -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N5]] -; CHECK-NEXT: [def-use] to [[N10]] -; CHECK-NEXT: [def-use] to [[N16]] -; CHECK-NEXT: [def-use] to [[N20]] +; CHECK-NEXT: br i1 %cmp21, label %for.body4.preheader, label %for.inc12 +; CHECK-NEXT: Edges:none! 
-; CHECK: Node Address:[[N27:0x[0-9a-f]*]]:pi-block -; CHECK-NEXT:--- start of nodes in pi-block --- -; CHECK: Node Address:[[N28:0x[0-9a-f]*]]:single-instruction +; CHECK: Node Address:[[N2]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %inc = add i64 %j.02, 1 +; CHECK-NEXT: %cmp2 = icmp ult i64 %inc, %sub ; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N29:0x[0-9a-f]*]] +; CHECK-NEXT: [def-use] to [[N19:0x[0-9a-f]*]] -; CHECK: Node Address:[[N29]]:single-instruction +; CHECK: Node Address:[[N19]]:single-instruction ; CHECK-NEXT: Instructions: -; CHECK-NEXT: %j.02 = phi i64 [ %inc, %for.body4 ], [ 1, %for.body4.preheader ] -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N28]] -; CHECK-NEXT:--- end of nodes in pi-block --- -; CHECK-NEXT: Edges: -; CHECK-NEXT: [def-use] to [[N7]] -; CHECK-NEXT: [def-use] to [[N13]] -; CHECK-NEXT: [def-use] to [[N18]] -; CHECK-NEXT: [def-use] to [[N2]] +; CHECK-NEXT: br i1 %cmp2, label %for.body4, label %for.inc12.loopexit +; CHECK-NEXT: Edges:none! ;; This test has no cycles. diff --git a/llvm/test/Analysis/DDG/root-node.ll b/llvm/test/Analysis/DDG/root-node.ll index 34d6437ef9c01..868fb72d452a3 100644 --- a/llvm/test/Analysis/DDG/root-node.ll +++ b/llvm/test/Analysis/DDG/root-node.ll @@ -7,12 +7,11 @@ ; CHECK-NEXT: [rooted] to [[N1:0x[0-9a-f]*]] ; CHECK-NEXT: [rooted] to [[N2:0x[0-9a-f]*]] -; CHECK: Node Address:[[N1]]:pi-block -; CHECK: %i2.03 = phi i64 [ 0, %for.body.lr.ph ], [ %inc2, %test1.for.body ] - ; CHECK: Node Address:[[N2]]:pi-block ; CHECK: %i1.02 = phi i64 [ 0, %for.body.lr.ph ], [ %inc, %test1.for.body ] +; CHECK: Node Address:[[N1]]:pi-block +; CHECK: %i2.03 = phi i64 [ 0, %for.body.lr.ph ], [ %inc2, %test1.for.body ] ;; // Two separate components in the graph. Root node must link to both. 
;; void test1(unsigned long n, float * restrict a, float * restrict b) { diff --git a/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll b/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll index 059bbaa3c4e74..3b1c43df5a701 100755 --- a/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll +++ b/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll @@ -8,7 +8,7 @@ define void @test() #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 40 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV]], 39 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Bindings/OCaml/bitwriter.ml b/llvm/test/Bindings/OCaml/bitwriter.ml index 28a61fee91b1b..17111bd3b51e0 100644 --- a/llvm/test/Bindings/OCaml/bitwriter.ml +++ b/llvm/test/Bindings/OCaml/bitwriter.ml @@ -17,7 +17,7 @@ let test x = if not x then exit 1 else () let read_file name = let ic = open_in_bin name in let len = in_channel_length ic in - let buf = String.create len in + let buf = Bytes.create len in test ((input ic buf 0 len) = len); @@ -46,4 +46,4 @@ let _ = test (file_buf = temp_bitcode m); test (file_buf = temp_bitcode ~unbuffered:false m); test (file_buf = temp_bitcode ~unbuffered:true m); - test (file_buf = Llvm.MemoryBuffer.as_string (Llvm_bitwriter.write_bitcode_to_memory_buffer m)) + test (file_buf = Bytes.of_string (Llvm.MemoryBuffer.as_string (Llvm_bitwriter.write_bitcode_to_memory_buffer m))) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-copy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-copy.mir new file mode 100644 index 0000000000000..d0e9fd5cd1a0b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-copy.mir @@ -0,0 +1,86 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -march=aarch64 -run-pass=aarch64-prelegalizer-combiner %s | FileCheck %s + +# Make sure we don't lose the register bank constraints when +# combining COPY instructions. +--- +name: test_none_none +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_none_none + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(s64) = COPY %0(s64) + $x0 = COPY %1(s64) +... +--- +name: test_gpr_none +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_gpr_none + ; CHECK: [[COPY:%[0-9]+]]:gpr(s64) = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]](s64) + %0:gpr(s64) = COPY $x0 + %1:_(s64) = COPY %0(s64) + $x0 = COPY %1(s64) +... +--- +name: test_none_gpr +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_none_gpr + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr(s64) = COPY [[COPY]](s64) + ; CHECK: $x0 = COPY [[COPY1]](s64) + %0:_(s64) = COPY $x0 + %1:gpr(s64) = COPY %0(s64) + $x0 = COPY %1(s64) +... +--- +name: test_fpr_gpr +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fpr_gpr + ; CHECK: [[COPY:%[0-9]+]]:fpr(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr(s64) = COPY [[COPY]](s64) + ; CHECK: $x0 = COPY [[COPY1]](s64) + %0:fpr(s64) = COPY $x0 + %1:gpr(s64) = COPY %0(s64) + $x0 = COPY %1(s64) +... 
+--- +name: test_gpr64_gpr64_dst_no_llt +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_gpr64_gpr64_dst_no_llt + ; CHECK: [[COPY:%[0-9]+]]:gpr64(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64 = COPY [[COPY]](s64) + ; CHECK: $x0 = COPY [[COPY1]] + %0:gpr64(s64) = COPY $x0 + %1:gpr64 = COPY %0(s64) + $x0 = COPY %1 +... +--- +name: test_gpr64_gpr64_src_no_llt +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_gpr64_gpr64_src_no_llt + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:gpr64(s64) = COPY [[COPY]] + ; CHECK: $x0 = COPY [[COPY1]](s64) + %0:gpr64 = COPY $x0 + %1:gpr64(s64) = COPY %0 + $x0 = COPY %1(s64) +... +--- +name: test_gpr64_gpr64_both_no_llt +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_gpr64_gpr64_both_no_llt + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $x0 + ; CHECK: $x0 = COPY [[COPY]] + %0:gpr64 = COPY $x0 + %1:gpr64 = COPY %0 + $x0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-trunc.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-trunc.mir new file mode 100644 index 0000000000000..2ee372ada08be --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-load-trunc.mir @@ -0,0 +1,22 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -run-pass=legalizer %s -o - -verify-machineinstrs | FileCheck %s +--- +name: test_load_trunc +stack: + - { id: 0, type: default, offset: 0, size: 2, + alignment: 2, stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + + ; CHECK-LABEL: name: test_load_trunc + ; CHECK: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[FRAME_INDEX]](p0) :: (load 2) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s10) = G_TRUNC [[LOAD]](s16) + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[TRUNC]](s10) + ; CHECK: RET_ReallyLR implicit [[TRUNC1]](s1) + %0:_(p0) = G_FRAME_INDEX %stack.0 + %1:_(s10) = G_LOAD %0(p0) :: (load 2) + %2:_(s1) = G_TRUNC %1(s10) + RET_ReallyLR implicit %2(s1) +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir index 587b519554a71..01e6bd820efbc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-with-no-legality-check.mir @@ -1433,8 +1433,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[MLAv8i8_:%[0-9]+]]:fpr64 = MLAv8i8 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i8_]] + ; CHECK: [[MULv8i8_:%[0-9]+]]:fpr64 = MULv8i8 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv8i8_:%[0-9]+]]:fpr64 = ADDv8i8 [[MULv8i8_]], [[COPY2]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i8_]] %4:fpr(<8 x s8>) = COPY $d2 %3:fpr(<8 x s8>) = COPY $d1 %2:fpr(<8 x s8>) = COPY $d0 @@ -1468,8 +1469,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2 ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[MLAv16i8_:%[0-9]+]]:fpr128 = MLAv16i8 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv16i8_]] + ; CHECK: [[MULv16i8_:%[0-9]+]]:fpr128 = MULv16i8 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv16i8_:%[0-9]+]]:fpr128 = ADDv16i8 [[MULv16i8_]], [[COPY2]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv16i8_]] %4:fpr(<16 x s8>) = COPY $q2 %3:fpr(<16 x s8>) = COPY $q1 %2:fpr(<16 x s8>) = COPY $q0 @@ -1503,8 +1505,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[MLAv4i16_:%[0-9]+]]:fpr64 = MLAv4i16 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv4i16_]] + ; CHECK: [[MULv4i16_:%[0-9]+]]:fpr64 = MULv4i16 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv4i16_:%[0-9]+]]:fpr64 = ADDv4i16 [[MULv4i16_]], [[COPY2]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv4i16_]] %4:fpr(<4 x s16>) = COPY $d2 %3:fpr(<4 x s16>) = COPY $d1 %2:fpr(<4 x s16>) = COPY $d0 @@ -1538,8 +1541,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2 ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[MLAv8i16_:%[0-9]+]]:fpr128 = MLAv8i16 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i16_]] + ; CHECK: [[MULv8i16_:%[0-9]+]]:fpr128 = MULv8i16 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv8i16_:%[0-9]+]]:fpr128 = ADDv8i16 [[MULv8i16_]], [[COPY2]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i16_]] %4:fpr(<8 x s16>) = COPY $q2 %3:fpr(<8 x s16>) = COPY $q1 %2:fpr(<8 x s16>) = COPY $q0 @@ -1759,8 +1763,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[MLAv8i8_:%[0-9]+]]:fpr64 = MLAv8i8 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i8_]] + ; CHECK: [[MULv8i8_:%[0-9]+]]:fpr64 = MULv8i8 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv8i8_:%[0-9]+]]:fpr64 = ADDv8i8 [[COPY2]], [[MULv8i8_]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i8_]] %4:fpr(<8 x s8>) = COPY $d2 %3:fpr(<8 x s8>) = COPY $d1 %2:fpr(<8 x s8>) = COPY $d0 @@ -1794,8 +1799,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2 ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[MLAv16i8_:%[0-9]+]]:fpr128 = MLAv16i8 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv16i8_]] + ; CHECK: [[MULv16i8_:%[0-9]+]]:fpr128 = MULv16i8 [[COPY1]], 
[[COPY]] + ; CHECK: [[ADDv16i8_:%[0-9]+]]:fpr128 = ADDv16i8 [[COPY2]], [[MULv16i8_]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv16i8_]] %4:fpr(<16 x s8>) = COPY $q2 %3:fpr(<16 x s8>) = COPY $q1 %2:fpr(<16 x s8>) = COPY $q0 @@ -1829,8 +1835,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[MLAv4i16_:%[0-9]+]]:fpr64 = MLAv4i16 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv4i16_]] + ; CHECK: [[MULv4i16_:%[0-9]+]]:fpr64 = MULv4i16 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv4i16_:%[0-9]+]]:fpr64 = ADDv4i16 [[COPY2]], [[MULv4i16_]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv4i16_]] %4:fpr(<4 x s16>) = COPY $d2 %3:fpr(<4 x s16>) = COPY $d1 %2:fpr(<4 x s16>) = COPY $d0 @@ -1864,8 +1871,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2 ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[MLAv8i16_:%[0-9]+]]:fpr128 = MLAv8i16 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLAv8i16_]] + ; CHECK: [[MULv8i16_:%[0-9]+]]:fpr128 = MULv8i16 [[COPY1]], [[COPY]] + ; CHECK: [[ADDv8i16_:%[0-9]+]]:fpr128 = ADDv8i16 [[COPY2]], [[MULv8i16_]] + ; CHECK: $noreg = PATCHABLE_RET [[ADDv8i16_]] %4:fpr(<8 x s16>) = COPY $q2 %3:fpr(<8 x s16>) = COPY $q1 %2:fpr(<8 x s16>) = COPY $q0 @@ -2085,8 +2093,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[MLSv8i8_:%[0-9]+]]:fpr64 = MLSv8i8 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLSv8i8_]] + ; CHECK: [[MULv8i8_:%[0-9]+]]:fpr64 = MULv8i8 [[COPY1]], [[COPY]] + ; CHECK: [[SUBv8i8_:%[0-9]+]]:fpr64 = SUBv8i8 [[COPY2]], [[MULv8i8_]] + ; CHECK: $noreg = PATCHABLE_RET [[SUBv8i8_]] %4:fpr(<8 x s8>) = COPY $d2 %3:fpr(<8 x s8>) = COPY $d1 %2:fpr(<8 x s8>) = COPY $d0 @@ -2120,8 +2129,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2 ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[MLSv16i8_:%[0-9]+]]:fpr128 = MLSv16i8 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLSv16i8_]] + ; CHECK: [[MULv16i8_:%[0-9]+]]:fpr128 = MULv16i8 [[COPY1]], [[COPY]] + ; CHECK: [[SUBv16i8_:%[0-9]+]]:fpr128 = SUBv16i8 [[COPY2]], [[MULv16i8_]] + ; CHECK: $noreg = PATCHABLE_RET [[SUBv16i8_]] %4:fpr(<16 x s8>) = COPY $q2 %3:fpr(<16 x s8>) = COPY $q1 %2:fpr(<16 x s8>) = COPY $q0 @@ -2155,8 +2165,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr64 = COPY $d2 ; CHECK: [[COPY1:%[0-9]+]]:fpr64 = COPY $d1 ; CHECK: [[COPY2:%[0-9]+]]:fpr64 = COPY $d0 - ; CHECK: [[MLSv4i16_:%[0-9]+]]:fpr64 = MLSv4i16 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLSv4i16_]] + ; CHECK: [[MULv4i16_:%[0-9]+]]:fpr64 = MULv4i16 [[COPY1]], [[COPY]] + ; CHECK: [[SUBv4i16_:%[0-9]+]]:fpr64 = SUBv4i16 [[COPY2]], [[MULv4i16_]] + ; CHECK: $noreg = PATCHABLE_RET [[SUBv4i16_]] %4:fpr(<4 x s16>) = COPY $d2 %3:fpr(<4 x s16>) = COPY $d1 %2:fpr(<4 x s16>) = COPY $d0 @@ -2190,8 +2201,9 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q2 ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1 ; CHECK: [[COPY2:%[0-9]+]]:fpr128 = COPY $q0 - ; CHECK: [[MLSv8i16_:%[0-9]+]]:fpr128 = MLSv8i16 [[COPY2]], [[COPY1]], [[COPY]] - ; CHECK: $noreg = PATCHABLE_RET [[MLSv8i16_]] + ; CHECK: [[MULv8i16_:%[0-9]+]]:fpr128 = MULv8i16 [[COPY1]], [[COPY]] + ; CHECK: [[SUBv8i16_:%[0-9]+]]:fpr128 = SUBv8i16 [[COPY2]], [[MULv8i16_]] + ; CHECK: $noreg = PATCHABLE_RET 
[[SUBv8i16_]] %4:fpr(<8 x s16>) = COPY $q2 %3:fpr(<8 x s16>) = COPY $q1 %2:fpr(<8 x s16>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll index b0a4256552726..ec3b51bd37a8d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-apple-ios7.0.0 -mcpu=cyclone -enable-misched=false | FileCheck %s ; rdar://13625505 @@ -5,15 +6,25 @@ ; varargs start right after at 8-byte alignment. define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp { ; CHECK-LABEL: fn9: -; 9th fixed argument -; CHECK: ldr {{w[0-9]+}}, [sp, #64] -; CHECK-DAG: add [[ARGS:x[0-9]+]], sp, #72 -; First vararg -; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #72] -; Second vararg -; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #80] -; Third vararg -; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #88] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #64 ; =64 +; CHECK-NEXT: ldr w8, [sp, #64] +; CHECK-NEXT: stp w2, w1, [sp, #52] +; CHECK-NEXT: stp w4, w3, [sp, #44] +; CHECK-NEXT: stp w6, w5, [sp, #36] +; CHECK-NEXT: str w7, [sp, #32] +; CHECK-NEXT: str w8, [x0] +; CHECK-NEXT: ldr w8, [sp, #72] +; CHECK-NEXT: str w8, [sp, #20] +; CHECK-NEXT: ldr w8, [sp, #80] +; CHECK-NEXT: str w8, [sp, #16] +; CHECK-NEXT: add x8, sp, #72 ; =72 +; CHECK-NEXT: add x8, x8, #24 ; =24 +; CHECK-NEXT: str x8, [sp, #24] +; CHECK-NEXT: ldr w8, [sp, #88] +; CHECK-NEXT: str w8, [sp, #12] +; CHECK-NEXT: add sp, sp, #64 ; =64 +; CHECK-NEXT: ret %1 = alloca i32, align 4 %2 = alloca i32, align 4 %3 = alloca i32, align 4 @@ -51,9 +62,47 @@ declare void @llvm.va_start(i8*) nounwind define i32 @main() nounwind ssp { ; CHECK-LABEL: main: -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK: str {{x[0-9]+}}, [sp, #8] -; CHECK: str {{w[0-9]+}}, [sp] +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #96 ; =96 +; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill +; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: str w8, [sp, #76] +; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: str w8, [sp, #72] +; CHECK-NEXT: mov w8, #3 +; CHECK-NEXT: str w8, [sp, #68] +; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: str w8, [sp, #64] +; CHECK-NEXT: mov w8, #5 +; CHECK-NEXT: str w8, [sp, #60] +; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: str w8, [sp, #56] +; CHECK-NEXT: mov w8, #7 +; CHECK-NEXT: str w8, [sp, #52] +; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: str w8, [sp, #48] +; CHECK-NEXT: mov w8, #9 +; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: stp w9, w8, [sp, #40] +; CHECK-NEXT: mov w10, #11 +; CHECK-NEXT: mov w11, #12 +; CHECK-NEXT: stp w11, w10, [sp, #32] +; CHECK-NEXT: stp x10, x11, [sp, #16] +; CHECK-NEXT: str x9, [sp, #8] +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: add x0, sp, #76 ; =76 +; CHECK-NEXT: mov w1, #2 +; CHECK-NEXT: mov w2, #3 +; CHECK-NEXT: mov w3, #4 +; CHECK-NEXT: mov w4, #5 +; CHECK-NEXT: mov w5, #6 +; CHECK-NEXT: mov w6, #7 +; CHECK-NEXT: mov w7, #8 +; CHECK-NEXT: bl _fn9 +; CHECK-NEXT: mov w0, #0 +; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #96 ; =96 +; CHECK-NEXT: ret %a1 = alloca i32, align 4 %a2 = alloca i32, align 4 %a3 = alloca i32, align 4 @@ -97,12 +146,20 @@ define i32 @main() nounwind ssp { ;rdar://13668483 @.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1 define void @foo(i8* %fmt, ...) 
nounwind { -entry: ; CHECK-LABEL: foo: -; CHECK: ldr {{w[0-9]+}}, [sp, #48] -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #23 -; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 -; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #48 ; =48 +; CHECK-NEXT: ldr w8, [sp, #48] +; CHECK-NEXT: str w8, [sp, #28] +; CHECK-NEXT: add x8, sp, #48 ; =48 +; CHECK-NEXT: add x8, x8, #23 ; =23 +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: add x9, x8, #16 ; =16 +; CHECK-NEXT: stp x9, x0, [sp, #32] +; CHECK-NEXT: ldr q0, [x8] +; CHECK-NEXT: str q0, [sp], #48 +; CHECK-NEXT: ret +entry: %fmt.addr = alloca i8*, align 8 %args = alloca i8*, align 8 %vc = alloca i32, align 4 @@ -118,10 +175,24 @@ entry: } define void @bar(i32 %x, <4 x i32> %y) nounwind { -entry: ; CHECK-LABEL: bar: -; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #16] -; CHECK: str {{x[0-9]+}}, [sp] +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #80 ; =80 +; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str w0, [sp, #60] +; CHECK-NEXT: stp q0, q0, [sp, #16] +; CHECK-NEXT: str x0, [sp] +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x0, l_.str@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: add x0, x0, l_.str@PAGEOFF +; CHECK-NEXT: bl _foo +; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; =80 +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh0, Lloh1 +entry: %x.addr = alloca i32, align 4 %y.addr = alloca <4 x i32>, align 16 store i32 %x, i32* %x.addr, align 4 @@ -137,12 +208,20 @@ entry: ; side is 16-byte aligned on stack. %struct.s41 = type { i32, i16, i32, i16 } define void @foo2(i8* %fmt, ...) nounwind { -entry: ; CHECK-LABEL: foo2: -; CHECK: ldr {{w[0-9]+}}, [sp, #48] -; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #23 -; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 -; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #48 ; =48 +; CHECK-NEXT: ldr w8, [sp, #48] +; CHECK-NEXT: str w8, [sp, #28] +; CHECK-NEXT: add x8, sp, #48 ; =48 +; CHECK-NEXT: add x8, x8, #23 ; =23 +; CHECK-NEXT: and x8, x8, #0xfffffffffffffff0 +; CHECK-NEXT: add x9, x8, #16 ; =16 +; CHECK-NEXT: stp x9, x0, [sp, #32] +; CHECK-NEXT: ldr q0, [x8] +; CHECK-NEXT: str q0, [sp], #48 +; CHECK-NEXT: ret +entry: %fmt.addr = alloca i8*, align 8 %args = alloca i8*, align 8 %vc = alloca i32, align 4 @@ -168,10 +247,25 @@ entry: declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind define void @bar2(i32 %x, i128 %s41.coerce) nounwind { -entry: ; CHECK-LABEL: bar2: -; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] -; CHECK: str {{x[0-9]+}}, [sp] +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #80 ; =80 +; CHECK-NEXT: stp x29, x30, [sp, #64] ; 16-byte Folded Spill +; CHECK-NEXT: ; kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: str w0, [sp, #60] +; CHECK-NEXT: stp x1, x2, [sp, #32] +; CHECK-NEXT: stp x1, x2, [sp, #16] +; CHECK-NEXT: str x0, [sp] +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x0, l_.str@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: add x0, x0, l_.str@PAGEOFF +; CHECK-NEXT: bl _foo2 +; CHECK-NEXT: ldp x29, x30, [sp, #64] ; 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; =80 +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpAdd Lloh2, Lloh3 +entry: %x.addr = alloca i32, align 4 %s41 = alloca %struct.s41, align 16 store i32 %x, i32* %x.addr, align 4 diff --git 
a/llvm/test/CodeGen/AArch64/arm64-never-combine-csr-local-stack-bump-for-size.ll b/llvm/test/CodeGen/AArch64/arm64-never-combine-csr-local-stack-bump-for-size.ll
deleted file mode 100644
index 273fb31e16c3b..0000000000000
--- a/llvm/test/CodeGen/AArch64/arm64-never-combine-csr-local-stack-bump-for-size.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -disable-post-ra | FileCheck %s
-
-; CHECK-LABEL: main:
-; CHECK: stp x29, x30, [sp, #-16]!
-; CHECK-NEXT: stp xzr, xzr, [sp, #-16]!
-; CHECK: adrp x0, l_.str@PAGE
-; CHECK: add x0, x0, l_.str@PAGEOFF
-; CHECK-NEXT: bl _puts
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: ldp x29, x30, [sp], #16
-; CHECK-NEXT: ret
-
-@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00"
-
-define i32 @main() nounwind ssp optsize {
-entry:
- %local1 = alloca i64, align 8
- %local2 = alloca i64, align 8
- store i64 0, i64* %local1
- store i64 0, i64* %local2
- %call = call i32 @puts(i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str, i32 0, i32 0))
- ret i32 %call
-}
-
-declare i32 @puts(i8*)
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
new file mode 100644
index 0000000000000..820d08bd94b4d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-cfi.ll
@@ -0,0 +1,80 @@
+; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \
+; RUN: aarch64-arm-none-eabi %s -o - | FileCheck %s
+
+; Function a's outlining candidate contains an sp-modifying add without a
+; corresponding sub, so we shouldn't outline it.
+define void @a() "sign-return-address"="all" "sign-return-address-key"="b_key" {
+; CHECK-LABEL: a: // @a
+; CHECK: // %bb.0:
+; CHECK-NEXT: .cfi_b_key_frame
+; CHECK-NEXT: pacibsp
+; CHECK-NEXT: .cfi_negate_ra_state
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ store i32 1, i32* %1, align 4
+ store i32 2, i32* %2, align 4
+ store i32 3, i32* %3, align 4
+ store i32 4, i32* %4, align 4
+ store i32 5, i32* %5, align 4
+ store i32 6, i32* %6, align 4
+; CHECK-NOT: bl OUTLINED_FUNCTION_{{[0-9]+}}
+; CHECK: autibsp
+; CHECK-NEXT: ret
+ ret void
+}
+
+define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" nounwind {
+; CHECK-LABEL: b: // @b
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: pacibsp
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ store i32 1, i32* %1, align 4
+ store i32 2, i32* %2, align 4
+ store i32 3, i32* %3, align 4
+ store i32 4, i32* %4, align 4
+ store i32 5, i32* %5, align 4
+ store i32 6, i32* %6, align 4
+; CHECK: bl [[OUTLINED_FUNC:OUTLINED_FUNCTION_[0-9]+]]
+; CHECK: autibsp
+; CHECK-NEXT: ret
+ ret void
+}
+
+define void @c() "sign-return-address"="all" "sign-return-address-key"="b_key" nounwind {
+; CHECK-LABEL: c: // @c
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: pacibsp
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ store i32 1, i32* %1, align 4
+ store i32 2, i32* %2, align 4
+ store i32 3, i32* %3, align 4
+ store i32 4, i32* %4, align 4
+ store i32 5, i32* %5, align 4
+ store i32 6, i32* %6, align 4
+; CHECK: bl [[OUTLINED_FUNC]]
+; CHECK: autibsp
+; CHECK-NEXT: ret
+ ret void
+}
+
+; CHECK: [[OUTLINED_FUNC]]
+; 
CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK: autibsp +; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll new file mode 100644 index 0000000000000..d8acaa9cbfd8e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-diff-scope-same-key.ll @@ -0,0 +1,68 @@ +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \ +; RUN: aarch64-arm-none-eabi %s -o - | FileCheck %s + +define void @a() "sign-return-address"="all" { +; CHECK-LABEL: a: // @a +; CHECK: paciasp +; CHECK-NEXT: .cfi_negate_ra_state + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autiasp + ret void +; CHECK: .cfi_endproc +} + +define void @b() "sign-return-address"="non-leaf" { +; CHECK-LABEL: b: // @b +; CHECK-NOT: paciasp +; CHECK-NOT: .cfi_negate_ra_state + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK-NOT: autiasp + ret void +; CHECK: .cfi_endproc +} + +define void @c() "sign-return-address"="all" { +; CHECK-LABEL: c: // @c +; CHECK: paciasp +; CHECK-NEXT: .cfi_negate_ra_state + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autiasp + ret void +; CHECK: .cfi_endproc +} + +; CHECK-NOT: OUTLINED_FUNCTION_{{[0-9]+}}: +; CHECK-NOT: // -- Begin function diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll new file mode 100644 index 0000000000000..c7cea17e7cf2d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-non-leaf.ll @@ -0,0 +1,72 @@ +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \ +; RUN: aarch64-arm-none-eabi %s -o - | FileCheck %s + +define i64 @a(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" { +; CHECK-LABEL: a: // @a +; CHECK: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + ret i64 %x +} + +define i64 @b(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" { +; CHECK-LABEL: b: // @b +; CHECK: .cfi_b_key_frame +; CHECK-NEXT: 
pacibsp +; CHECK-NEXT: .cfi_negate_ra_state + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + ret i64 %x +} + +define i64 @c(i64 %x) "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" { +; CHECK-LABEL: c: // @c +; CHECK: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 + call void asm sideeffect "mov x30, $0", "r,~{lr}"(i64 %x) #1 + ret i64 %x +} + +; Outlined function is leaf-function => don't sign it +; CHECK-LABEL: OUTLINED_FUNCTION_0: +; CHECK-NOT: .cfi_b_key_frame +; CHECK-NOT: paci{{[a,b]}}sp +; CHECK-NOT: .cfi_negate_ra_state +; CHECK-NOT: auti{{[a,b]}}sp diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir new file mode 100644 index 0000000000000..e65adce5c1b4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-regsave.mir @@ -0,0 +1,127 @@ +# RUN: llc -mtriple=aarch64-arm-none-eabi -run-pass=prologepilog \ +# RUN: -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s + +# Check that we save LR to a callee-saved register when possible. +# foo() should use a callee-saved register. However, bar() should not. +--- | + + define void @foo() #0 { + ret void + } + + define void @bar() #0 { + ret void + } + + attributes #0 = { nounwind "sign-return-address"="non-leaf" "sign-return-address-key"="b_key" minsize noinline noredzone "no-frame-pointer-elim"="true" } +... +--- +# CHECK-LABEL: name: foo +# CHECK: bb.0: +# CHECK: frame-setup EMITBKEY +# CHECK-NEXT: frame-setup PACIBSP +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state +# CHECK: bb.1: +# CHECK: BL @[[OUTLINED_FUNCTION:OUTLINED_FUNCTION_[0-9]+]] +# CHECK: bb.2: +# CHECK: BL @[[OUTLINED_FUNCTION]] +# CHECK: bb.3: +# CHECK: BL @[[OUTLINED_FUNCTION]] +# CHECK: bb.4: +# CHECK: BL @[[OUTLINED_FUNCTION]] +# CHECK: bb.5: +# CHECK: frame-destroy AUTIBSP +# CHECK-NEXT: RET +name: foo +tracksRegLiveness: true +fixedStack: +body: | + bb.0: + $x25 = ORRXri $xzr, 1 + $lr = ORRXri $xzr, 1 + bb.1: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.2: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.3: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.4: + liveins: $lr, $w9 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 1 + $w9 = ORRWri $wzr, 2 + bb.5: + liveins: $w9 + RET undef $lr + +... 
+---
+# CHECK: name: bar
+# CHECK: bb.0:
+# CHECK-NOT: OUTLINED_FUNCTION_
+# CHECK: bb.1:
+# CHECK-NOT: OUTLINED_FUNCTION_
+# CHECK: bb.2:
+# CHECK-NOT: OUTLINED_FUNCTION_
+# CHECK: bb.3:
+# CHECK-NOT: OUTLINED_FUNCTION_
+# CHECK: RET
+name: bar
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w12 = ORRWri $wzr, 2
+ bb.1:
+ liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w12 = ORRWri $wzr, 2
+ bb.2:
+ liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w10 = ORRWri $wzr, 1
+ $w12 = ORRWri $wzr, 2
+ bb.3:
+ liveins: $lr, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27, $x28
+ RET undef $lr
+
+# CHECK: name: [[OUTLINED_FUNCTION]]
+# CHECK: body:
+# CHECK-NEXT: bb.0:
+# CHECK-NOT: frame-setup EMITBKEY
+# CHECK-NOT: frame-setup PACI{{[A,B]}}SP
+# CHECK-NOT: frame-setup CFI_INSTRUCTION negate_ra_sign_state
+# CHECK-NOT: frame-destroy AUTI{{[A,B]}}SP
diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
new file mode 100644
index 0000000000000..4348d73743067
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-diff-key.ll
@@ -0,0 +1,69 @@
+; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \
+; RUN: aarch64-arm-none-eabi %s -o - | FileCheck %s
+
+define void @a() "sign-return-address"="all" {
+; CHECK-LABEL: a: // @a
+; CHECK: paciasp
+; CHECK-NEXT: .cfi_negate_ra_state
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ store i32 1, i32* %1, align 4
+ store i32 2, i32* %2, align 4
+ store i32 3, i32* %3, align 4
+ store i32 4, i32* %4, align 4
+ store i32 5, i32* %5, align 4
+ store i32 6, i32* %6, align 4
+; CHECK: autiasp
+ ret void
+; CHECK: .cfi_endproc
+}
+
+define void @b() "sign-return-address"="all" "sign-return-address-key"="b_key" {
+; CHECK-LABEL: b: // @b
+; CHECK: .cfi_b_key_frame
+; CHECK-NEXT: pacibsp
+; CHECK-NEXT: .cfi_negate_ra_state
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %5 = alloca i32, align 4
+ %6 = alloca i32, align 4
+ store i32 1, i32* %1, align 4
+ store i32 2, i32* %2, align 4
+ store i32 3, i32* %3, align 4
+ store i32 4, i32* %4, align 4
+ store i32 5, i32* %5, align 4
+ store i32 6, i32* %6, align 4
+; CHECK-NOT: autiasp
+ ret void
+; CHECK: .cfi_endproc
+}
+
+define void @c() "sign-return-address"="all" {
+; CHECK-LABEL: c: // @c
+; CHECK: paciasp
+; CHECK-NEXT: .cfi_negate_ra_state
+ %1 = alloca i32, align 4
+ %2 = alloca i32, 
align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autiasp + ret void +; CHECK: .cfi_endproc +} + +; CHECK-NOT: OUTLINED_FUNCTION_0: +; CHECK-NOT: // -- Begin function diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-same-key-a.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-same-key-a.ll new file mode 100644 index 0000000000000..f5e229a20ef28 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-same-key-a.ll @@ -0,0 +1,64 @@ +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \ +; RUN: aarch64-arm-none-eabi %s -o - | FileCheck %s + +define void @a() "sign-return-address"="all" "sign-return-address-key"="a_key" nounwind { +; CHECK-LABEL: a: // @a +; CHECK: paciasp + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autiasp + ret void +} + +define void @b() "sign-return-address"="all" nounwind { +; CHECK-LABEL: b: // @b +; CHECK: paciasp + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autiasp + ret void +} + +define void @c() "sign-return-address"="all" nounwind { +; CHECK-LABEL: c: // @c +; CHECK: paciasp + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autiasp + ret void +} + +; CHECK-LABEL: OUTLINED_FUNCTION_0: +; CHECK: paciasp +; CHECK: autiasp +; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-same-key-b.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-same-key-b.ll new file mode 100644 index 0000000000000..c1940b44d2dad --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-same-scope-same-key-b.ll @@ -0,0 +1,70 @@ +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \ +; RUN: aarch64-arm-none-eabi %s -o - | FileCheck %s + +define void @a() "sign-return-address"="all" "sign-return-address-key"="b_key" nounwind { +; CHECK-LABEL: a: // @a +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: pacibsp + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autibsp + ret void +} + +define void @b() 
"sign-return-address"="all" "sign-return-address-key"="b_key" nounwind { +; CHECK-LABEL: b: // @b +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: pacibsp + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autibsp + ret void +} + +define void @c() "sign-return-address"="all" "sign-return-address-key"="b_key" nounwind { +; CHECK-LABEL: c: // @c +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: pacibsp + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autibsp + ret void +} + +; CHECK-LABEL: OUTLINED_FUNCTION_0: +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK: autibsp +; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir new file mode 100644 index 0000000000000..2645a6553ffd8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-sp-mod.mir @@ -0,0 +1,204 @@ +# RUN: llc -verify-machineinstrs -run-pass=machine-outliner %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-arm-linux-gnu" + + @v = common dso_local global i32* null, align 8 + + ; Function Attrs: nounwind + define dso_local void @legal0() #0 { + %1 = alloca i32, align 4 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + ret void + } + + ; Function Attrs: nounwind + define dso_local void @legal1() #0 { + %1 = alloca i32, align 4 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + ret void + } + + ; Function Attrs: nounwind + define dso_local void @illegal0() #0 { + %1 = alloca i32, align 4 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + ret void + } + + ; Function Attrs: nounwind + define dso_local void @illegal1() #0 { + %1 = alloca i32, align 4 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + store volatile i32* %1, i32** @v, align 8 + ret void + } + + attributes #0 = { 
nounwind "sign-return-address"="all" "sign-return-address-key"="a_key" noinline noredzone "no-frame-pointer-elim"="true" } + +... +--- +name: legal0 +tracksRegLiveness: true +body: | + bb.0 (%ir-block.0): + liveins: $lr + frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp + frame-setup CFI_INSTRUCTION negate_ra_sign_state + $sp = frame-setup SUBXri $sp, 16, 0 + renamable $x8 = ADRP target-flags(aarch64-page) @v + $x9 = ADDXri $sp, 12, 0 + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + $sp = frame-destroy ADDXri $sp, 16, 0 + frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp + RET undef $lr + +# CHECK-LABEL: name: legal0 +# CHECK: body: | +# CHECK-NEXT: bb.0 (%ir-block.0): +# CHECK-NEXT: liveins: $lr +# CHECK: frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state +# CHECK: BL @[[OUTLINED_FUNC:OUTLINED_FUNCTION_[0-9]+]] +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: RET undef $lr + +... +--- +name: legal1 +tracksRegLiveness: true +body: | + bb.0 (%ir-block.0): + liveins: $lr + frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp + frame-setup CFI_INSTRUCTION negate_ra_sign_state + $sp = frame-setup SUBXri $sp, 16, 0 + renamable $x8 = ADRP target-flags(aarch64-page) @v + $x9 = ADDXri $sp, 12, 0 + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + $sp = frame-destroy ADDXri $sp, 16, 0 + frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp + RET undef $lr + +# CHECK-LABEL: name: legal1 +# CHECK: body: | +# CHECK-NEXT: bb.0 (%ir-block.0): +# CHECK-NEXT: liveins: $lr +# CHECK: frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state +# CHECK: BL @[[OUTLINED_FUNC]] +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: RET undef $lr + +... 
+--- +name: illegal0 +tracksRegLiveness: true +body: | + bb.0 (%ir-block.0): + liveins: $lr + frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp + frame-setup CFI_INSTRUCTION negate_ra_sign_state + $sp = frame-setup SUBXri $sp, 16, 0 + renamable $x8 = ADRP target-flags(aarch64-page) @v + $x9 = ADDXri $sp, 12, 0 + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + $sp = frame-destroy ADDXri $sp, 12, 0 + frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp + RET undef $lr + +... +--- +name: illegal1 +tracksRegLiveness: true +body: | + bb.0 (%ir-block.0): + liveins: $lr + frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp + frame-setup CFI_INSTRUCTION negate_ra_sign_state + $sp = frame-setup SUBXri $sp, 16, 0 + renamable $x8 = ADRP target-flags(aarch64-page) @v + $x9 = ADDXri $sp, 12, 0 + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui renamable $x9, renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + STRXui killed renamable $x9, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @v :: (volatile store 8 into @v) + $sp = frame-destroy ADDXri $sp, 12, 0 + frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp + RET undef $lr + +# CHECK-LABEL: name: illegal0 +# CHECK: body: | +# CHECK-NEXT: bb.0 (%ir-block.0): +# CHECK-NEXT: liveins: $lr +# CHECK: frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state +# CHECK-NOT: BL @OUTLINED_FUNCTION_{{.*}} +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: RET undef $lr + +# CHECK-LABEL: name: illegal1 +# CHECK: body: | +# CHECK-NEXT: bb.0 (%ir-block.0): +# CHECK-NEXT: liveins: $lr +# CHECK: frame-setup PACIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state +# CHECK-NOT: BL @OUTLINED_FUNCTION_{{.*}} +# CHECK: frame-destroy AUTIASP implicit-def $lr, implicit killed $lr, implicit $sp +# CHECK-NEXT: RET undef $lr + +# Outlined function that contains only legal sp modifications +# CHECK: name: [[OUTLINED_FUNC]] +# CHECK: body: | +# CHECK-NEXT: bb.0: +# CHECK-NEXT: 
frame-setup PACIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: frame-setup CFI_INSTRUCTION negate_ra_sign_state +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: frame-destroy AUTIASP implicit-def $lr, implicit $lr, implicit $sp +# CHECK-NEXT: RET undef $lr diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll new file mode 100644 index 0000000000000..c2bb291506ae2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-subtarget.ll @@ -0,0 +1,87 @@ +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \ +; RUN: aarch64-arm-linux-gnu %s -o - | FileCheck %s + +; Check that functions that should sign their return addresses are not +; outlined unless either all of the functions support v8.3a features or none +; of them do. + +define void @a() #0 { +; CHECK-LABEL: a: // @a +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NOT: OUTLINED_FUNCTION_ + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: retab +; CHECK-NOT: auti[a,b]sp + ret void +} + +define void @b() #0 { +; CHECK-LABEL: b: // @b +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NOT: OUTLINED_FUNCTION_ + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: retab +; CHECK-NOT: auti[a,b]sp + ret void +} + +define void @c() #1 { +; CHECK-LABEL: c: // @c +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK-NOT: OUTLINED_FUNCTION_ + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: autibsp +; CHECK-NOT: ret{{[a,b]}} + ret void +} + +attributes #0 = { "sign-return-address"="all" + "sign-return-address-key"="b_key" + "target-features"="+v8.3a" } + +attributes #1 = { "sign-return-address"="all" + "sign-return-address-key"="b_key" } + +; CHECK-NOT: OUTLINED_FUNCTION_ diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll new file mode 100644 index 0000000000000..d76dc5ef8a8e7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll @@ -0,0 +1,63 @@ +; RUN: llc -mtriple aarch64-arm-linux-gnu --enable-machine-outliner \ +; RUN: -verify-machineinstrs %s -o - | FileCheck %s + +declare i32 @thunk_called_fn(i32, i32, i32, i32) + +define i32 @a() #0 { +; CHECK-LABEL: a: // @a +; CHECK: //
%bb.0: // %entry +; CHECK-NEXT: paciasp +; CHECK: autiasp +; CHECK-NEXT: ret +entry: + %call = tail call i32 @thunk_called_fn(i32 1, i32 2, i32 3, i32 4) + %cx = add i32 %call, 8 + ret i32 %cx +} + +define i32 @b() #0 { +; CHECK-LABEL: b: // @b +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK: autiasp +; CHECK-NEXT: ret +entry: + %call = tail call i32 @thunk_called_fn(i32 1, i32 2, i32 3, i32 4) + %cx = add i32 %call, 88 + ret i32 %cx +} + +define hidden i32 @c(i32 (i32, i32, i32, i32)* %fptr) #0 { +; CHECK-LABEL: c: // @c +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK: autiasp +; CHECK-NEXT: ret +entry: + %call = tail call i32 %fptr(i32 1, i32 2, i32 3, i32 4) + %add = add nsw i32 %call, 8 + ret i32 %add +} + +define hidden i32 @d(i32 (i32, i32, i32, i32)* %fptr) #0 { +; CHECK-LABEL: d: // @d +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: paciasp +; CHECK-NEXT: .cfi_negate_ra_state +; CHECK: autiasp +; CHECK-NEXT: ret +entry: + %call = tail call i32 %fptr(i32 1, i32 2, i32 3, i32 4) + %add = add nsw i32 %call, 88 + ret i32 %add +} + +attributes #0 = { "sign-return-address"="non-leaf" } + +; CHECK-NOT: [[OUTLINED_FUNCTION_{{.*}}]] +; CHECK-NOT: .cfi_b_key_frame +; CHECK-NOT: paci{{[a,b]}}sp +; CHECK-NOT: .cfi_negate_ra_state +; CHECK-NOT: auti{{[a,b]}}sp diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-v8-3.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-v8-3.ll new file mode 100644 index 0000000000000..05f4dc2e8c2cf --- /dev/null +++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-v8-3.ll @@ -0,0 +1,83 @@ +; RUN: llc -verify-machineinstrs -enable-machine-outliner -mtriple \ +; RUN: aarch64-arm-linux-gnu %s -o - | FileCheck %s + +; Check that outlined functions use the dedicated RETAA/RETAB instructions +; to authenticate their return address, where available.
+ +define void @a() #0 { +; CHECK-LABEL: a: // @a +; CHECK: // %bb.0: +; CHECK-NEXT: pacibsp +; CHECK: bl [[OUTLINED_FUNC:OUTLINED_FUNCTION_[0-9]+]] + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: retab +; CHECK-NOT: auti[a,b]sp + ret void +} + +define void @b() #0 { +; CHECK-LABEL: b: // @b +; CHECK: // %bb.0: +; CHECK-NEXT: pacibsp +; CHECK: bl OUTLINED_FUNC + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: retab +; CHECK-NOT: auti[a,b]sp + ret void +} + +define void @c() #0 { +; CHECK-LABEL: c: // @c +; CHECK: // %bb.0: +; CHECK-NEXT: pacibsp +; CHECK: bl OUTLINED_FUNC + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + store i32 1, i32* %1, align 4 + store i32 2, i32* %2, align 4 + store i32 3, i32* %3, align 4 + store i32 4, i32* %4, align 4 + store i32 5, i32* %5, align 4 + store i32 6, i32* %6, align 4 +; CHECK: retab +; CHECK-NOT: auti[a,b]sp + ret void +} + +attributes #0 = { "sign-return-address"="all" + "sign-return-address-key"="b_key" + "target-features"="+v8.3a" + nounwind } + +; CHECK: OUTLINED_FUNC +; CHECK: // %bb.0: +; CHECK-NEXT: .cfi_b_key_frame +; CHECK-NEXT: pacibsp +; CHECK: retab +; CHECK-NOT: auti[a,b]sp diff --git a/llvm/test/CodeGen/AArch64/macro-fusion.ll b/llvm/test/CodeGen/AArch64/macro-fusion.ll index 97bca14df5790..b9a263fe2e23a 100644 --- a/llvm/test/CodeGen/AArch64/macro-fusion.ll +++ b/llvm/test/CodeGen/AArch64/macro-fusion.ll @@ -1,21 +1,18 @@ ; REQUIRES: asserts ; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fuse-arith-logic -verify-misched -debug-only=machine-scheduler 2>&1 > /dev/null | FileCheck %s -; Verify that, the macro-fusion creates the necessary dependencies between SUs. +; Verify that the macro-fusion creates the necessary dependencies between SUs and +; that at most two SUs are fused.
define signext i32 @test(i32 signext %a, i32 signext %b, i32 signext %c, i32 signext %d) { entry: ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: %bb.0 entry ; CHECK: Macro fuse: SU([[SU4:[0-9]+]]) - SU([[SU5:[0-9]+]]) ; CHECK: Bind SU([[SU1:[0-9]+]]) - SU([[SU4]]) -; CHECK: Macro fuse: SU([[SU5]]) - SU([[SU6:[0-9]+]]) -; CHECK: Bind SU([[SU0:[0-9]+]]) - SU([[SU5]]) -; CHECK: SU([[SU0]]): %{{[0-9]+}}:gpr32 = COPY $w3 +; CHECK-NOT: Macro fuse: ; CHECK: SU([[SU1]]): %{{[0-9]+}}:gpr32 = COPY $w2 ; CHECK: SU([[SU4]]): %{{[0-9]+}}:gpr32 = nsw ADDWrr ; CHECK: SU([[SU5]]): %{{[0-9]+}}:gpr32 = nsw ADDWrr -; CHECK: SU([[SU6]]): %{{[0-9]+}}:gpr32 = nsw SUBWrr - %add = add nsw i32 %b, %a %add1 = add nsw i32 %add, %c %sub = sub nsw i32 %add1, %d diff --git a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll index 71bb0e70abfaa..a4b9ef8eff575 100644 --- a/llvm/test/CodeGen/AArch64/neon-mla-mls.ll +++ b/llvm/test/CodeGen/AArch64/neon-mla-mls.ll @@ -1,85 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { -;CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK-LABEL: mla8xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: mla v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = add <8 x i8> %C, %tmp1; ret <8 x i8> %tmp2 } define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -;CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK-LABEL: mla16xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: mla v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <16 x i8> %A, %B; %tmp2 = add <16 x i8> %C, %tmp1; ret <16 x i8> %tmp2 } define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { -;CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-LABEL: mla4xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: mla v2.4h, v0.4h, v1.4h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <4 x i16> %A, %B; %tmp2 = add <4 x i16> %C, %tmp1; ret <4 x i16> %tmp2 } define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { -;CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK-LABEL: mla8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: mla v2.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <8 x i16> %A, %B; %tmp2 = add <8 x i16> %C, %tmp1; ret <8 x i16> %tmp2 } define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { -;CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK-LABEL: mla2xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: mla v2.2s, v0.2s, v1.2s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <2 x i32> %A, %B; %tmp2 = add <2 x i32> %C, %tmp1; ret <2 x i32> %tmp2 } define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { -;CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-LABEL: mla4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <4 x i32> %A, %B; %tmp2 = add <4 x i32> %C, %tmp1; ret <4 x i32> %tmp2 } define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { -;CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK-LABEL: mls8xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: mls v2.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v0.16b, v2.16b 
+; CHECK-NEXT: ret %tmp1 = mul <8 x i8> %A, %B; %tmp2 = sub <8 x i8> %C, %tmp1; ret <8 x i8> %tmp2 } define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { -;CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +; CHECK-LABEL: mls16xi8: +; CHECK: // %bb.0: +; CHECK-NEXT: mls v2.16b, v0.16b, v1.16b +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <16 x i8> %A, %B; %tmp2 = sub <16 x i8> %C, %tmp1; ret <16 x i8> %tmp2 } define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { -;CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +; CHECK-LABEL: mls4xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: mls v2.4h, v0.4h, v1.4h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <4 x i16> %A, %B; %tmp2 = sub <4 x i16> %C, %tmp1; ret <4 x i16> %tmp2 } define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { -;CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +; CHECK-LABEL: mls8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: mls v2.8h, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <8 x i16> %A, %B; %tmp2 = sub <8 x i16> %C, %tmp1; ret <8 x i16> %tmp2 } define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { -;CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK-LABEL: mls2xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: mls v2.2s, v0.2s, v1.2s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <2 x i32> %A, %B; %tmp2 = sub <2 x i32> %C, %tmp1; ret <2 x i32> %tmp2 } define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { -;CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s +; CHECK-LABEL: mls4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: mls v2.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %tmp1 = mul <4 x i32> %A, %B; %tmp2 = sub <4 x i32> %C, %tmp1; ret <4 x i32> %tmp2 diff --git a/llvm/test/CodeGen/AArch64/neon-vcadd.ll b/llvm/test/CodeGen/AArch64/neon-vcadd.ll new file mode 100644 index 0000000000000..11605267c09b4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-vcadd.ll @@ -0,0 +1,67 @@ +; RUN: llc %s -mtriple=aarch64 -mattr=+v8.3a,+fullfp16 -o - | FileCheck %s + +define <4 x half> @foo16x4_rot(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: foo16x4_rot +; CHECK-DAG: fcadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #90 +; CHECK-DAG: fcadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, #270 + %vcadd_rot90_v2.i = tail call <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16(<4 x half> %a, <4 x half> %b) + %vcadd_rot270_v2.i = tail call <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16(<4 x half> %a, <4 x half> %b) + %add = fadd <4 x half> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i + ret <4 x half> %add +} + +define <2 x float> @foo32x2_rot(<2 x float> %a, <2 x float> %b) { +entry: +; CHECK-LABEL: foo32x2_rot +; CHECK-DAG: fcadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #90 +; CHECK-DAG: fcadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, #270 + %vcadd_rot90_v2.i = tail call <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32(<2 x float> %a, <2 x float> %b) + %vcadd_rot270_v2.i = tail call <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32(<2 x float> %a, <2 x float> %b) + %add = fadd <2 x float> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i + ret <2 x float> %add +} + +define <8 x half> @foo16x8_rot(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: foo16x8_rot +; CHECK-DAG: fcadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, #90 +; CHECK-DAG: fcadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, #270 + 
%vcaddq_rot90_v2.i = tail call <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16(<8 x half> %a, <8 x half> %b) + %vcaddq_rot270_v2.i = tail call <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16(<8 x half> %a, <8 x half> %b) + %add = fadd <8 x half> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i + ret <8 x half> %add +} + +define <4 x float> @foo32x4_rot(<4 x float> %a, <4 x float> %b) { +entry: +; CHECK-LABEL: foo32x4_rot +; CHECK-DAG: fcadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, #90 +; CHECK-DAG: fcadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, #270 + %vcaddq_rot90_v2.i = tail call <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32(<4 x float> %a, <4 x float> %b) + %vcaddq_rot270_v2.i = tail call <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32(<4 x float> %a, <4 x float> %b) + %add = fadd <4 x float> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i + ret <4 x float> %add +} + +define <2 x double> @foo64x2_rot(<2 x double> %a, <2 x double> %b) { +entry: +; CHECK-LABEL: foo64x2_rot +; CHECK-DAG: fcadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, #90 +; CHECK-DAG: fcadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, #270 + %vcaddq_rot90_v2.i = tail call <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64(<2 x double> %a, <2 x double> %b) + %vcaddq_rot270_v2.i = tail call <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64(<2 x double> %a, <2 x double> %b) + %add = fadd <2 x double> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i + ret <2 x double> %add +} + +declare <4 x half> @llvm.aarch64.neon.vcadd.rot90.v4f16(<4 x half>, <4 x half>) +declare <4 x half> @llvm.aarch64.neon.vcadd.rot270.v4f16(<4 x half>, <4 x half>) +declare <2 x float> @llvm.aarch64.neon.vcadd.rot90.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.aarch64.neon.vcadd.rot270.v2f32(<2 x float>, <2 x float>) +declare <8 x half> @llvm.aarch64.neon.vcadd.rot90.v8f16(<8 x half>, <8 x half>) +declare <8 x half> @llvm.aarch64.neon.vcadd.rot270.v8f16(<8 x half>, <8 x half>) +declare <4 x float> @llvm.aarch64.neon.vcadd.rot90.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.aarch64.neon.vcadd.rot270.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vcadd.rot90.v2f64(<2 x double>, <2 x double>) +declare <2 x double> @llvm.aarch64.neon.vcadd.rot270.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll new file mode 100644 index 0000000000000..67b54e46e36cb --- /dev/null +++ b/llvm/test/CodeGen/AArch64/overeager_mla_fusing.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s --mtriple aarch64 -verify-machineinstrs -o - | FileCheck %s + +define dso_local void @jsimd_idct_ifast_neon_intrinsic(i8* nocapture readonly %dct_table, i16* nocapture readonly %coef_block, i8** nocapture readonly %output_buf, i32 %output_col) local_unnamed_addr #0 { +; CHECK-LABEL: jsimd_idct_ifast_neon_intrinsic: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldr q0, [x1, #32] +; CHECK-NEXT: ldr q1, [x1, #96] +; CHECK-NEXT: ldr q2, [x0, #32] +; CHECK-NEXT: ldr q3, [x0, #96] +; CHECK-NEXT: ldr x8, [x2, #48] +; CHECK-NEXT: mov w9, w3 +; CHECK-NEXT: mul v0.8h, v2.8h, v0.8h +; CHECK-NEXT: mul v1.8h, v3.8h, v1.8h +; CHECK-NEXT: add v2.8h, v0.8h, v1.8h +; CHECK-NEXT: str q2, [x8, x9] +; CHECK-NEXT: ldr x8, [x2, #56] +; CHECK-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-NEXT: str q0, [x8, x9] +; CHECK-NEXT: ret +entry: + %add.ptr5 = getelementptr inbounds i16, i16* %coef_block, i64 16 + %0 = 
bitcast i16* %add.ptr5 to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 16 + + %add.ptr17 = getelementptr inbounds i16, i16* %coef_block, i64 48 + %2 = bitcast i16* %add.ptr17 to <8 x i16>* + %3 = load <8 x i16>, <8 x i16>* %2, align 16 + + %add.ptr29 = getelementptr inbounds i8, i8* %dct_table, i64 32 + %4 = bitcast i8* %add.ptr29 to <8 x i16>* + %5 = load <8 x i16>, <8 x i16>* %4, align 16 + + %add.ptr41 = getelementptr inbounds i8, i8* %dct_table, i64 96 + %6 = bitcast i8* %add.ptr41 to <8 x i16>* + %7 = load <8 x i16>, <8 x i16>* %6, align 16 + + %mul.i966 = mul <8 x i16> %5, %1 + %mul.i964 = mul <8 x i16> %7, %3 + + %add.i961 = add <8 x i16> %mul.i966, %mul.i964 + %sub.i960 = sub <8 x i16> %mul.i966, %mul.i964 + + %idx.ext = zext i32 %output_col to i64 + + %arrayidx404 = getelementptr inbounds i8*, i8** %output_buf, i64 6 + %8 = load i8*, i8** %arrayidx404, align 8 + %add.ptr406 = getelementptr inbounds i8, i8* %8, i64 %idx.ext + %9 = bitcast i8* %add.ptr406 to <8 x i16>* + store <8 x i16> %add.i961, <8 x i16>* %9, align 8 + + %arrayidx408 = getelementptr inbounds i8*, i8** %output_buf, i64 7 + %10 = load i8*, i8** %arrayidx408, align 8 + %add.ptr410 = getelementptr inbounds i8, i8* %10, i64 %idx.ext + %11 = bitcast i8* %add.ptr410 to <8 x i16>* + store <8 x i16> %sub.i960, <8 x i16>* %11, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-int-log-pred.ll b/llvm/test/CodeGen/AArch64/sve-int-log-pred.ll index 5e12981fd67e3..ad6dc9c2d23a6 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-log-pred.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-log-pred.ll @@ -125,6 +125,46 @@ define @xor_pred_i64( %pg, %out } +define @bic_pred_i8( %pg, %a, %b) { +; CHECK-LABEL: bic_pred_i8: +; CHECK: bic z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bic.nxv2i8( %pg, + %a, + %b) + ret %out +} + +define @bic_pred_i16( %pg, %a, %b) { +; CHECK-LABEL: bic_pred_i16: +; CHECK: bic z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bic.nxv2i16( %pg, + %a, + %b) + ret %out +} + + +define @bic_pred_i32( %pg, %a, %b) { +; CHECK-LABEL: bic_pred_i32: +; CHECK: bic z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bic.nxv2i32( %pg, + %a, + %b) + ret %out +} + +define @bic_pred_i64( %pg, %a, %b) { +; CHECK-LABEL: bic_pred_i64: +; CHECK: bic z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.bic.nxv2i64( %pg, + %a, + %b) + ret %out +} declare @llvm.aarch64.sve.and.nxv2i8(,,) declare @llvm.aarch64.sve.and.nxv2i16(,,) @@ -138,3 +178,7 @@ declare @llvm.aarch64.sve.xor.nxv2i8(, @llvm.aarch64.sve.xor.nxv2i16(,,) declare @llvm.aarch64.sve.xor.nxv2i32(,,) declare @llvm.aarch64.sve.xor.nxv2i64(,,) +declare @llvm.aarch64.sve.bic.nxv2i8(,,) +declare @llvm.aarch64.sve.bic.nxv2i16(,,) +declare @llvm.aarch64.sve.bic.nxv2i32(,,) +declare @llvm.aarch64.sve.bic.nxv2i64(,,) diff --git a/llvm/test/CodeGen/AArch64/sve-int-log.ll b/llvm/test/CodeGen/AArch64/sve-int-log.ll index cdd562823bf7f..3c45d0511f7a8 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-log.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-log.ll @@ -99,8 +99,8 @@ define @bic_d( %a, %b) { ; CHECK-LABEL: bic_d ; CHECK: bic z0.d, z0.d, z1.d ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.bic.nxv2i64( %a, - %b) + %res = call @llvm.aarch64.sve.bic.base.nxv2i64( %a, + %b) ret %res } @@ -108,8 +108,8 @@ define @bic_s( %a, %b) { ; CHECK-LABEL: bic_s ; CHECK: bic z0.d, z0.d, z1.d ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.bic.nxv4i32( %a, - %b) + %res = call 
@llvm.aarch64.sve.bic.base.nxv4i32( %a, + %b) ret %res } @@ -117,8 +117,8 @@ define @bic_h( %a, %b) { ; CHECK-LABEL: bic_h ; CHECK: bic z0.d, z0.d, z1.d ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.bic.nxv8i16( %a, - %b) + %res = call @llvm.aarch64.sve.bic.base.nxv8i16( %a, + %b) ret %res } @@ -127,12 +127,12 @@ define @bic_b( %a, %b) { ; CHECK-LABEL: bic_b ; CHECK: bic z0.d, z0.d, z1.d ; CHECK-NEXT: ret - %res = call @llvm.aarch64.sve.bic.nxv16i8( %a, - %b) + %res = call @llvm.aarch64.sve.bic.base.nxv16i8( %a, + %b) ret %res } -declare @llvm.aarch64.sve.bic.nxv2i64(, ) -declare @llvm.aarch64.sve.bic.nxv4i32(, ) -declare @llvm.aarch64.sve.bic.nxv8i16(, ) -declare @llvm.aarch64.sve.bic.nxv16i8(, ) +declare @llvm.aarch64.sve.bic.base.nxv2i64(, ) +declare @llvm.aarch64.sve.bic.base.nxv4i32(, ) +declare @llvm.aarch64.sve.bic.base.nxv8i16(, ) +declare @llvm.aarch64.sve.bic.base.nxv16i8(, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll new file mode 100644 index 0000000000000..a3fd4faf196f0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll @@ -0,0 +1,99 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; CNTB +; + +define i64 @cntb() { +; CHECK-LABEL: cntb: +; CHECK: cntb x0, vl2 +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntb(i32 2) + ret i64 %out +} + +; +; CNTH +; + +define i64 @cnth() { +; CHECK-LABEL: cnth: +; CHECK: cnth x0, vl3 +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cnth(i32 3) + ret i64 %out +} + +; +; CNTW +; + +define i64 @cntw() { +; CHECK-LABEL: cntw: +; CHECK: cntw x0, vl4 +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntw(i32 4) + ret i64 %out +} + +; +; CNTD +; + +define i64 @cntd() { +; CHECK-LABEL: cntd: +; CHECK: cntd x0, vl5 +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntd(i32 5) + ret i64 %out +} + +; +; CNTP +; + +define i64 @cntp_b8( %pg, %a) { +; CHECK-LABEL: cntp_b8: +; CHECK: cntp x0, p0, p1.b +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntp.nxv16i1( %pg, + %a) + ret i64 %out +} + +define i64 @cntp_b16( %pg, %a) { +; CHECK-LABEL: cntp_b16: +; CHECK: cntp x0, p0, p1.h +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntp.nxv8i1( %pg, + %a) + ret i64 %out +} + +define i64 @cntp_b32( %pg, %a) { +; CHECK-LABEL: cntp_b32: +; CHECK: cntp x0, p0, p1.s +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntp.nxv4i1( %pg, + %a) + ret i64 %out +} + +define i64 @cntp_b64( %pg, %a) { +; CHECK-LABEL: cntp_b64: +; CHECK: cntp x0, p0, p1.d +; CHECK-NEXT: ret + %out = call i64 @llvm.aarch64.sve.cntp.nxv2i1( %pg, + %a) + ret i64 %out +} + +declare i64 @llvm.aarch64.sve.cntb(i32 %pattern) +declare i64 @llvm.aarch64.sve.cnth(i32 %pattern) +declare i64 @llvm.aarch64.sve.cntw(i32 %pattern) +declare i64 @llvm.aarch64.sve.cntd(i32 %pattern) + +declare i64 @llvm.aarch64.sve.cntp.nxv16i1(, ) +declare i64 @llvm.aarch64.sve.cntp.nxv8i1(, ) +declare i64 @llvm.aarch64.sve.cntp.nxv4i1(, ) +declare i64 @llvm.aarch64.sve.cntp.nxv2i1(, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-converts.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-converts.ll new file mode 100644 index 0000000000000..e777a2f3b8b04 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-fp-converts.ll @@ -0,0 +1,400 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; FCVT +; + +define @fcvt_f16_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvt_f16_f32: +; CHECK: fcvt z0.h, 
p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvt.f16f32( %a, + %pg, + %b) + ret %out +} + +define @fcvt_f16_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvt_f16_f64: +; CHECK: fcvt z0.h, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvt.f16f64( %a, + %pg, + %b) + ret %out +} + +define @fcvt_f32_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvt_f32_f16: +; CHECK: fcvt z0.s, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvt.f32f16( %a, + %pg, + %b) + ret %out +} + +define @fcvt_f32_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvt_f32_f64: +; CHECK: fcvt z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvt.f32f64( %a, + %pg, + %b) + ret %out +} + +define @fcvt_f64_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvt_f64_f16: +; CHECK: fcvt z0.d, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvt.f64f16( %a, + %pg, + %b) + ret %out +} + +define @fcvt_f64_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvt_f64_f32: +; CHECK: fcvt z0.d, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvt.f64f32( %a, + %pg, + %b) + ret %out +} + +; +; FCVTZS +; + +define @fcvtzs_i16_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i16_f16: +; CHECK: fcvtzs z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtzs_i32_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i32_f32: +; CHECK: fcvtzs z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32( %a, + %pg, + %b) + ret %out +} + +define @fcvtzs_i64_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i64_f64: +; CHECK: fcvtzs z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64( %a, + %pg, + %b) + ret %out +} + +define @fcvtzs_i32_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i32_f16: +; CHECK: fcvtzs z0.s, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.i32f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtzs_i32_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i32_f64: +; CHECK: fcvtzs z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.i32f64( %a, + %pg, + %b) + ret %out +} + +define @fcvtzs_i64_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i64_f16: +; CHECK: fcvtzs z0.d, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.i64f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtzs_i64_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvtzs_i64_f32: +; CHECK: fcvtzs z0.d, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzs.i64f32( %a, + %pg, + %b) + ret %out +} + +; +; FCVTZU +; + +define @fcvtzu_i16_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i16_f16: +; CHECK: fcvtzu z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtzu_i32_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i32_f32: +; CHECK: fcvtzu z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32( %a, + %pg, + %b) + ret %out +} + +define @fcvtzu_i64_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i64_f64: +; CHECK: fcvtzu z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64( %a, + %pg, + %b) + ret %out +} + +define @fcvtzu_i32_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i32_f16: +; CHECK: fcvtzu z0.s, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.i32f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtzu_i32_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i32_f64: +; CHECK: fcvtzu z0.s, p0/m, z1.d +; 
CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.i32f64( %a, + %pg, + %b) + ret %out +} + +define @fcvtzu_i64_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i64_f16: +; CHECK: fcvtzu z0.d, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.i64f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtzu_i64_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvtzu_i64_f32: +; CHECK: fcvtzu z0.d, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtzu.i64f32( %a, + %pg, + %b) + ret %out +} + +; +; SCVTF +; + +define @scvtf_f16_i16( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f16_i16: +; CHECK: scvtf z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16( %a, + %pg, + %b) + ret %out +} + +define @scvtf_f32_i32( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f32_i32: +; CHECK: scvtf z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.nxv4f32.nxv4i32( %a, + %pg, + %b) + ret %out +} + +define @scvtf_f64_i64( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f64_i64: +; CHECK: scvtf z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.nxv2f64.nxv2i64( %a, + %pg, + %b) + ret %out +} + +define @scvtf_f16_i32( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f16_i32: +; CHECK: scvtf z0.h, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.f16i32( %a, + %pg, + %b) + ret %out +} + +define @scvtf_f16_i64( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f16_i64: +; CHECK: scvtf z0.h, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.f16i64( %a, + %pg, + %b) + ret %out +} + +define @scvtf_f32_i64( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f32_i64: +; CHECK: scvtf z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.f32i64( %a, + %pg, + %b) + ret %out +} + +define @scvtf_f64_i32( %a, %pg, %b) { +; CHECK-LABEL: scvtf_f64_i32: +; CHECK: scvtf z0.d, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.scvtf.f64i32( %a, + %pg, + %b) + ret %out +} + +; +; UCVTF +; + +define @ucvtf_f16_i16( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f16_i16: +; CHECK: ucvtf z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16( %a, + %pg, + %b) + ret %out +} + +define @ucvtf_f32_i32( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f32_i32: +; CHECK: ucvtf z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.nxv4f32.nxv4i32( %a, + %pg, + %b) + ret %out +} + +define @ucvtf_f64_i64( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f64_i64: +; CHECK: ucvtf z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.nxv2f64.nxv2i64( %a, + %pg, + %b) + ret %out +} + +define @ucvtf_f16_i32( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f16_i32: +; CHECK: ucvtf z0.h, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.f16i32( %a, + %pg, + %b) + ret %out +} + +define @ucvtf_f16_i64( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f16_i64: +; CHECK: ucvtf z0.h, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.f16i64( %a, + %pg, + %b) + ret %out +} + +define @ucvtf_f32_i64( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f32_i64: +; CHECK: ucvtf z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.f32i64( %a, + %pg, + %b) + ret %out +} + +define @ucvtf_f64_i32( %a, %pg, %b) { +; CHECK-LABEL: ucvtf_f64_i32: +; CHECK: ucvtf z0.d, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.ucvtf.f64i32( %a, + %pg, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fcvt.f16f32(, , ) +declare @llvm.aarch64.sve.fcvt.f16f64(, , ) +declare @llvm.aarch64.sve.fcvt.f32f16(, , ) +declare 
@llvm.aarch64.sve.fcvt.f32f64(, , ) +declare @llvm.aarch64.sve.fcvt.f64f16(, , ) +declare @llvm.aarch64.sve.fcvt.f64f32(, , ) + +declare @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16(, , ) +declare @llvm.aarch64.sve.fcvtzs.nxv4i32.nxv4f32(, , ) +declare @llvm.aarch64.sve.fcvtzs.nxv2i64.nxv2f64(, , ) +declare @llvm.aarch64.sve.fcvtzs.i32f16(, , ) +declare @llvm.aarch64.sve.fcvtzs.i32f64(, , ) +declare @llvm.aarch64.sve.fcvtzs.i64f16(, , ) +declare @llvm.aarch64.sve.fcvtzs.i64f32(, , ) + +declare @llvm.aarch64.sve.fcvtzu.nxv8i16.nxv8f16(, , ) +declare @llvm.aarch64.sve.fcvtzu.nxv4i32.nxv4f32(, , ) +declare @llvm.aarch64.sve.fcvtzu.nxv2i64.nxv2f64(, , ) +declare @llvm.aarch64.sve.fcvtzu.i32f16(, , ) +declare @llvm.aarch64.sve.fcvtzu.i32f64(, , ) +declare @llvm.aarch64.sve.fcvtzu.i64f16(, , ) +declare @llvm.aarch64.sve.fcvtzu.i64f32(, , ) + +declare @llvm.aarch64.sve.scvtf.nxv8f16.nxv8i16(, , ) +declare @llvm.aarch64.sve.scvtf.nxv4f32.nxv4i32(, , ) +declare @llvm.aarch64.sve.scvtf.nxv2f64.nxv2i64(, , ) +declare @llvm.aarch64.sve.scvtf.f16i32(, , ) +declare @llvm.aarch64.sve.scvtf.f16i64(, , ) +declare @llvm.aarch64.sve.scvtf.f32i64(, , ) +declare @llvm.aarch64.sve.scvtf.f64i32(, , ) + +declare @llvm.aarch64.sve.ucvtf.nxv8f16.nxv8i16(, , ) +declare @llvm.aarch64.sve.ucvtf.nxv4f32.nxv4i32(, , ) +declare @llvm.aarch64.sve.ucvtf.nxv2f64.nxv2i64(, , ) +declare @llvm.aarch64.sve.ucvtf.f16i32(, , ) +declare @llvm.aarch64.sve.ucvtf.f16i64(, , ) +declare @llvm.aarch64.sve.ucvtf.f32i64(, , ) +declare @llvm.aarch64.sve.ucvtf.f64i32(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll new file mode 100644 index 0000000000000..74241389d3ac2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-scaled-offsets.ll @@ -0,0 +1,198 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1H, LD1W, LD1D: base + 32-bit scaled offset, sign (sxtw) or zero (uxtw) +; extended to 64 bits +; e.g. 
ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] +; + +; LD1H +define @gld1h_s_uxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_s_uxtw_index: +; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_s_sxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_s_sxtw_index: +; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_d_uxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_d_uxtw_index: +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_d_sxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_d_sxtw_index: +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +; LD1W +define @gld1w_s_uxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_s_uxtw_index: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gld1w_s_sxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_s_sxtw_index: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gld1w_d_uxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_d_uxtw_index: +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_d_sxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_d_sxtw_index: +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_s_uxtw_index_float( %pg, float* %base, %b) { +; CHECK-LABEL: gld1w_s_uxtw_index_float: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32( %pg, + float* %base, + %b) + ret %load +} + +define @gld1w_s_sxtw_index_float( %pg, float* %base, %b) { +; CHECK-LABEL: gld1w_s_sxtw_index_float: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32( %pg, + float* %base, + %b) + ret %load +} + +; LD1D +define @gld1d_s_uxtw_index( %pg, i64* %base, 
%b) { +; CHECK-LABEL: gld1d_s_uxtw_index: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_sxtw_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_sxtw_index: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_uxtw_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_uxtw_index_double: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64( %pg, + double* %base, + %b) + ret %load +} + +define @gld1d_sxtw_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_sxtw_index_double: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64( %pg, + double* %base, + %b) + ret %load +} + +; LD1H +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i16.nxv4i32(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i16.nxv4i32(, i16*, ) + +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i16.nxv2i64(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i16.nxv2i64(, i16*, ) + +; LD1W +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4i32.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4i32.nxv4i32(, i32*, ) + +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i32.nxv2i64(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i32.nxv2i64(, i32*, ) + +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv4f32.nxv4i32(, float*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv4f32.nxv4i32(, float*, ) + +; LD1D +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2i64.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2i64.nxv2i64(, i64*, ) + +declare @llvm.aarch64.sve.ld1.gather.uxtw.index.nxv2f64.nxv2i64(, double*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.index.nxv2f64.nxv2i64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll new file mode 100644 index 0000000000000..a4d26f29a9db3 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-32bit-unscaled-offsets.ll @@ -0,0 +1,259 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1B, LD1W, LD1H, LD1D: base + 32-bit unscaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. 
ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] +; + +; LD1B +define @gld1b_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gld1b_s_uxtw: +; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1b_s_sxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gld1b_s_sxtw: +; CHECK: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1b_d_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gld1b_d_uxtw: +; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1b_d_sxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gld1b_d_sxtw: +; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +; LD1H +define @gld1h_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_s_uxtw: +; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_s_sxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_s_sxtw: +; CHECK: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_d_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_d_uxtw: +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_d_sxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_d_sxtw: +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +; LD1W +define @gld1w_s_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_s_uxtw: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gld1w_s_sxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_s_sxtw: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gld1w_d_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: 
gld1w_d_uxtw: +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_d_sxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_d_sxtw: +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_s_uxtw_float( %pg, float* %base, %b) { +; CHECK-LABEL: gld1w_s_uxtw_float: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32.nxv4i32( %pg, + float* %base, + %b) + ret %load +} + +define @gld1w_s_sxtw_float( %pg, float* %base, %b) { +; CHECK-LABEL: gld1w_s_sxtw_float: +; CHECK: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32.nxv4i32( %pg, + float* %base, + %b) + ret %load +} + +; LD1D +define @gld1d_d_uxtw( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_d_uxtw: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_d_sxtw( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_d_sxtw: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_d_uxtw_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_d_uxtw_double: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64.nxv2i64( %pg, + double* %base, + %b) + ret %load +} + +define @gld1d_d_sxtw_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_d_sxtw_double: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64.nxv2i64( %pg, + double* %base, + %b) + ret %load +} + +; LD1B +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i8.nxv4i32(, i8*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i8.nxv2i64(, i8*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i8.nxv4i32(, i8*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i8.nxv2i64(, i8*, ) + +; LD1H +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i16.nxv4i32(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i16.nxv2i64(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i16.nxv4i32(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i16.nxv2i64(, i16*, ) + +; LD1W +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv4i32.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i32.nxv2i64(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv4i32.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i32.nxv2i64(, i32*, ) + +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv4f32.nxv4i32(, float*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv4f32.nxv4i32(, float*, ) + +; LD1D +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv2i64.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv2i64.nxv2i64(, i64*, ) + +declare @llvm.aarch64.sve.ld1.gather.sxtw.nxv2f64.nxv2i64(, double*, ) +declare @llvm.aarch64.sve.ld1.gather.uxtw.nxv2f64.nxv2i64(, double*, ) diff --git 
a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll new file mode 100644 index 0000000000000..274eaad0eef1d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1H, LD1W, LD1D: base + 64-bit scaled offset +; e.g. ld1h z0.d, p0/z, [x0, z0.d, lsl #1] +; + +define @gld1h_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_index +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gld1w_index +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1d_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_index +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_index_double +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.index.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +declare @llvm.aarch64.sve.ld1.gather.index.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ld1.gather.index.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll new file mode 100644 index 0000000000000..9a8df453b336f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1B, LD1W, LD1H, LD1D: base + 64-bit unscaled offset +; e.g. 
ld1h { z0.d }, p0/z, [x0, z0.d] +; + +define @gld1b_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gld1b_d: +; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1h_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gld1h_d: +; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gld1w_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gld1w_d: +; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = zext %load to + ret %res +} + +define @gld1d_d( %pg, i64* %base, %b) { +; CHECK-LABEL: gld1d_d: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gld1d_d_double( %pg, double* %base, %b) { +; CHECK-LABEL: gld1d_d_double: +; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +declare @llvm.aarch64.sve.ld1.gather.nxv2i8(, i8*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ld1.gather.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-vector-base.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-vector-base.ll new file mode 100644 index 0000000000000..42d9f86302456 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-vector-base.ll @@ -0,0 +1,139 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1B, LD1W, LD1H, LD1D: vector + immediate (index) +; e.g. 
ld1h { z0.s }, p0/z, [z0.s, #16] +; + +; LD1B +define @gld1b_s_imm( %pg, %base) { +; CHECK-LABEL: gld1b_s_imm: +; CHECK: ld1b { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv4i8.nxv4i32( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +define @gld1b_d_imm( %pg, %base) { +; CHECK-LABEL: gld1b_d_imm: +; CHECK: ld1b { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: mov w8, #255 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv2i8.nxv2i64( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +; LD1H +define @gld1h_s_imm( %pg, %base) { +; CHECK-LABEL: gld1h_s_imm: +; CHECK: ld1h { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.s, w8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv4i16.nxv4i32( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +define @gld1h_d_imm( %pg, %base) { +; CHECK-LABEL: gld1h_d_imm: +; CHECK: ld1h { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: mov w8, #65535 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv2i16.nxv2i64( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +; LD1W +define @gld1w_s_imm( %pg, %base) { +; CHECK-LABEL: gld1w_s_imm: +; CHECK: ld1w { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv4i32.nxv4i32( %pg, + %base, + i64 16) + ret %load +} + +define @gld1w_d_imm( %pg, %base) { +; CHECK-LABEL: gld1w_d_imm: +; CHECK: ld1w { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: mov w8, #-1 +; CHECK-NEXT: mov z1.d, x8 +; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv2i32.nxv2i64( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +define @gld1w_s_imm_float( %pg, %base) { +; CHECK-LABEL: gld1w_s_imm_float: +; CHECK: ld1w { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv4f32.nxv4i32( %pg, + %base, + i64 16) + ret %load +} + +; LD1D +define @gld1d_d_imm( %pg, %base) { +; CHECK-LABEL: gld1d_d_imm: +; CHECK: ld1d { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv2i64.nxv2i64( %pg, + %base, + i64 16) + ret %load +} + +define @gld1d_d_imm_double( %pg, %base) { +; CHECK-LABEL: gld1d_d_imm_double: +; CHECK: ld1d { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ld1.gather.imm.nxv2f64.nxv2i64( %pg, + %base, + i64 16) + ret %load +} + +; LD1B +declare @llvm.aarch64.sve.ld1.gather.imm.nxv4i8.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ld1.gather.imm.nxv2i8.nxv2i64(, , i64) + +; LD1H +declare @llvm.aarch64.sve.ld1.gather.imm.nxv4i16.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ld1.gather.imm.nxv2i16.nxv2i64(, , i64) + +; LD1W +declare @llvm.aarch64.sve.ld1.gather.imm.nxv4i32.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ld1.gather.imm.nxv2i32.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ld1.gather.imm.nxv4f32.nxv4i32(, , i64) + +; LD1D +declare @llvm.aarch64.sve.ld1.gather.imm.nxv2i64.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ld1.gather.imm.nxv2f64.nxv2i64(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-reversal.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-reversal.ll new file mode 100644 index 
0000000000000..69adf7fc68380 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-reversal.ll @@ -0,0 +1,166 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; RBIT +; + +define @rbit_i8( %a, %pg, %b) { +; CHECK-LABEL: rbit_i8: +; CHECK: rbit z0.b, p0/m, z1.b +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.rbit.nxv16i8( %a, + %pg, + %b) + ret %out +} + +define @rbit_i16( %a, %pg, %b) { +; CHECK-LABEL: rbit_i16: +; CHECK: rbit z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.rbit.nxv8i16( %a, + %pg, + %b) + ret %out +} + +define @rbit_i32( %a, %pg, %b) { +; CHECK-LABEL: rbit_i32: +; CHECK: rbit z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.rbit.nxv4i32( %a, + %pg, + %b) + ret %out +} + +define @rbit_i64( %a, %pg, %b) { +; CHECK-LABEL: rbit_i64: +; CHECK: rbit z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.rbit.nxv2i64( %a, + %pg, + %b) + ret %out +} + +; +; REVB +; + +define @revb_i16( %a, %pg, %b) { +; CHECK-LABEL: revb_i16: +; CHECK: revb z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.revb.nxv8i16( %a, + %pg, + %b) + ret %out +} + +define @revb_i32( %a, %pg, %b) { +; CHECK-LABEL: revb_i32: +; CHECK: revb z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.revb.nxv4i32( %a, + %pg, + %b) + ret %out +} + +define @revb_i64( %a, %pg, %b) { +; CHECK-LABEL: revb_i64: +; CHECK: revb z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.revb.nxv2i64( %a, + %pg, + %b) + ret %out +} + +; +; REVB (bswap) +; + +define @revb_i16_bswap( %a) { +; CHECK-LABEL: revb_i16_bswap: +; CHECK: ptrue [[PG:p[0-9]+]].h +; CHECK-NEXT: revb z0.h, [[PG]]/m, z0.h +; CHECK-NEXT: ret + %res = call @llvm.bswap.nxv8i16( %a) + ret %res +} + +define @revb_i32_bswap( %a) { +; CHECK-LABEL: revb_i32_bswap: +; CHECK: ptrue [[PG:p[0-9]+]].s +; CHECK-NEXT: revb z0.s, [[PG]]/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.bswap.nxv4i32( %a) + ret %res +} + +define @revb_i64_bswap( %a) { +; CHECK-LABEL: revb_i64_bswap: +; CHECK: ptrue [[PG:p[0-9]+]].d +; CHECK-NEXT: revb z0.d, [[PG]]/m, z0.d +; CHECK-NEXT: ret + %res = call @llvm.bswap.nxv2i64( %a) + ret %res +} + +; +; REVH +; + +define @revh_i32( %a, %pg, %b) { +; CHECK-LABEL: revh_i32: +; CHECK: revh z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.revh.nxv4i32( %a, + %pg, + %b) + ret %out +} + +define @revh_i64( %a, %pg, %b) { +; CHECK-LABEL: revh_i64: +; CHECK: revh z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.revh.nxv2i64( %a, + %pg, + %b) + ret %out +} + +; +; REVW +; + +define @revw_i64( %a, %pg, %b) { +; CHECK-LABEL: revw_i64: +; CHECK: revw z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.revw.nxv2i64( %a, + %pg, + %b) + ret %out +} + +declare @llvm.aarch64.sve.rbit.nxv16i8(, , ) +declare @llvm.aarch64.sve.rbit.nxv8i16(, , ) +declare @llvm.aarch64.sve.rbit.nxv4i32(, , ) +declare @llvm.aarch64.sve.rbit.nxv2i64(, , ) + +declare @llvm.aarch64.sve.revb.nxv8i16(, , ) +declare @llvm.aarch64.sve.revb.nxv4i32(, , ) +declare @llvm.aarch64.sve.revb.nxv2i64(, , ) + +declare @llvm.bswap.nxv8i16() +declare @llvm.bswap.nxv4i32() +declare @llvm.bswap.nxv2i64() + +declare @llvm.aarch64.sve.revh.nxv4i32(, , ) +declare @llvm.aarch64.sve.revh.nxv2i64(, , ) + +declare @llvm.aarch64.sve.revw.nxv2i64(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll new file mode 100644 index 0000000000000..b1b3dc61560b4 --- 
/dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts.ll @@ -0,0 +1,367 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ASR +; + +define @asr_i8( %pg, %a, %b) { +; CHECK-LABEL: asr_i8: +; CHECK: asr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @asr_i16( %pg, %a, %b) { +; CHECK-LABEL: asr_i16: +; CHECK: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @asr_i32( %pg, %a, %b) { +; CHECK-LABEL: asr_i32: +; CHECK: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @asr_i64( %pg, %a, %b) { +; CHECK-LABEL: asr_i64: +; CHECK: asr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.nxv2i64( %pg, + %a, + %b) + ret %out +} + +define @asr_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i8: +; CHECK: asr z0.b, p0/m, z0.b, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @asr_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i16: +; CHECK: asr z0.h, p0/m, z0.h, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @asr_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i32: +; CHECK: asr z0.s, p0/m, z0.s, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asr.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; +; ASRD +; + +define @asrd_i8( %pg, %a) { +; CHECK-LABEL: asrd_i8: +; CHECK: asrd z0.b, p0/m, z0.b, #1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asrd.nxv16i8( %pg, + %a, + i32 1) + ret %out +} + +define @asrd_i16( %pg, %a) { +; CHECK-LABEL: asrd_i16: +; CHECK: asrd z0.h, p0/m, z0.h, #2 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asrd.nxv8i16( %pg, + %a, + i32 2) + ret %out +} + +define @asrd_i32( %pg, %a) { +; CHECK-LABEL: asrd_i32: +; CHECK: asrd z0.s, p0/m, z0.s, #31 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asrd.nxv4i32( %pg, + %a, + i32 31) + ret %out +} + +define @asrd_i64( %pg, %a) { +; CHECK-LABEL: asrd_i64: +; CHECK: asrd z0.d, p0/m, z0.d, #64 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.asrd.nxv2i64( %pg, + %a, + i32 64) + ret %out +} + +; +; INSR +; + +define @insr_i8( %a, i8 %b) { +; CHECK-LABEL: insr_i8: +; CHECK: insr z0.b, w0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.insr.nxv16i8( %a, i8 %b) + ret %out +} + +define @insr_i16( %a, i16 %b) { +; CHECK-LABEL: insr_i16: +; CHECK: insr z0.h, w0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.insr.nxv8i16( %a, i16 %b) + ret %out +} + +define @insr_i32( %a, i32 %b) { +; CHECK-LABEL: insr_i32: +; CHECK: insr z0.s, w0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.insr.nxv4i32( %a, i32 %b) + ret %out +} + +define @insr_i64( %a, i64 %b) { +; CHECK-LABEL: insr_i64: +; CHECK: insr z0.d, x0 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.insr.nxv2i64( %a, i64 %b) + ret %out +} + +define @insr_f16( %a, half %b) { +; CHECK-LABEL: insr_f16: +; CHECK: insr z0.h, h1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.insr.nxv8f16( %a, half %b) + ret %out +} + +define @insr_f32( %a, float %b) { +; CHECK-LABEL: insr_f32: +; CHECK: insr z0.s, s1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.insr.nxv4f32( %a, float %b) + ret %out +} + +define @insr_f64( %a, double %b) { +; CHECK-LABEL: insr_f64: +; CHECK: insr z0.d, d1 +; CHECK-NEXT: ret + %out = call 
@llvm.aarch64.sve.insr.nxv2f64( %a, double %b) + ret %out +} + +; +; LSL +; + +define @lsl_i8( %pg, %a, %b) { +; CHECK-LABEL: lsl_i8: +; CHECK: lsl z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsl_i16( %pg, %a, %b) { +; CHECK-LABEL: lsl_i16: +; CHECK: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsl_i32( %pg, %a, %b) { +; CHECK-LABEL: lsl_i32: +; CHECK: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @lsl_i64( %pg, %a, %b) { +; CHECK-LABEL: lsl_i64: +; CHECK: lsl z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.nxv2i64( %pg, + %a, + %b) + ret %out +} + +define @lsl_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i8: +; CHECK: lsl z0.b, p0/m, z0.b, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsl_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i16: +; CHECK: lsl z0.h, p0/m, z0.h, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsl_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i32: +; CHECK: lsl z0.s, p0/m, z0.s, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsl.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +; +; LSR +; + +define @lsr_i8( %pg, %a, %b) { +; CHECK-LABEL: lsr_i8: +; CHECK: lsr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsr_i16( %pg, %a, %b) { +; CHECK-LABEL: lsr_i16: +; CHECK: lsr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsr_i32( %pg, %a, %b) { +; CHECK-LABEL: lsr_i32: +; CHECK: lsr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.nxv4i32( %pg, + %a, + %b) + ret %out +} + +define @lsr_i64( %pg, %a, %b) { +; CHECK-LABEL: lsr_i64: +; CHECK: lsr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.nxv2i64( %pg, + %a, + %b) + ret %out +} + +define @lsr_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i8: +; CHECK: lsr z0.b, p0/m, z0.b, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.wide.nxv16i8( %pg, + %a, + %b) + ret %out +} + +define @lsr_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i16: +; CHECK: lsr z0.h, p0/m, z0.h, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.wide.nxv8i16( %pg, + %a, + %b) + ret %out +} + +define @lsr_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i32: +; CHECK: lsr z0.s, p0/m, z0.s, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg, + %a, + %b) + ret %out +} + +declare @llvm.aarch64.sve.asr.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.nxv4i32(, , ) +declare @llvm.aarch64.sve.asr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.asr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.asrd.nxv16i8(, , i32) +declare @llvm.aarch64.sve.asrd.nxv8i16(, , i32) +declare @llvm.aarch64.sve.asrd.nxv4i32(, , i32) +declare @llvm.aarch64.sve.asrd.nxv2i64(, , i32) + +declare @llvm.aarch64.sve.insr.nxv16i8(, i8) +declare @llvm.aarch64.sve.insr.nxv8i16(, i16) +declare @llvm.aarch64.sve.insr.nxv4i32(, i32) +declare 
@llvm.aarch64.sve.insr.nxv2i64(, i64) +declare @llvm.aarch64.sve.insr.nxv8f16(, half) +declare @llvm.aarch64.sve.insr.nxv4f32(, float) +declare @llvm.aarch64.sve.insr.nxv2f64(, double) + +declare @llvm.aarch64.sve.lsl.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.nxv4i32(, , ) +declare @llvm.aarch64.sve.lsl.nxv2i64(, , ) + +declare @llvm.aarch64.sve.lsl.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.lsr.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.nxv4i32(, , ) +declare @llvm.aarch64.sve.lsr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.lsr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv4i32(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll new file mode 100644 index 0000000000000..0590c74d2efc9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-while.ll @@ -0,0 +1,309 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; WHILELE +; + +define @whilele_b_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilele_b_ww: +; CHECK: whilele p0.b, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv16i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilele_b_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilele_b_xx: +; CHECK: whilele p0.b, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv16i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilele_h_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilele_h_ww: +; CHECK: whilele p0.h, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv8i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilele_h_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilele_h_xx: +; CHECK: whilele p0.h, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv8i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilele_s_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilele_s_ww: +; CHECK: whilele p0.s, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv4i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilele_s_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilele_s_xx: +; CHECK: whilele p0.s, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv4i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilele_d_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilele_d_ww: +; CHECK: whilele p0.d, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv2i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilele_d_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilele_d_xx: +; CHECK: whilele p0.d, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilele.nxv2i1.i64(i64 %a, i64 %b) + ret %out +} + +; +; WHILELO +; + +define @whilelo_b_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelo_b_ww: +; CHECK: whilelo p0.b, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelo_b_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelo_b_xx: +; CHECK: whilelo p0.b, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilelo_h_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelo_h_ww: +; CHECK: whilelo p0.h, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv8i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelo_h_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelo_h_xx: +; CHECK: whilelo p0.h, 
x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilelo_s_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelo_s_ww: +; CHECK: whilelo p0.s, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv4i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelo_s_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelo_s_xx: +; CHECK: whilelo p0.s, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilelo_d_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelo_d_ww: +; CHECK: whilelo p0.d, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv2i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelo_d_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelo_d_xx: +; CHECK: whilelo p0.d, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64 %a, i64 %b) + ret %out +} + +; +; WHILELS +; + +define @whilels_b_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilels_b_ww: +; CHECK: whilels p0.b, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv16i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilels_b_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilels_b_xx: +; CHECK: whilels p0.b, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv16i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilels_h_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilels_h_ww: +; CHECK: whilels p0.h, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv8i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilels_h_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilels_h_xx: +; CHECK: whilels p0.h, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv8i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilels_s_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilels_s_ww: +; CHECK: whilels p0.s, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv4i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilels_s_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilels_s_xx: +; CHECK: whilels p0.s, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv4i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilels_d_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilels_d_ww: +; CHECK: whilels p0.d, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv2i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilels_d_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilels_d_xx: +; CHECK: whilels p0.d, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilels.nxv2i1.i64(i64 %a, i64 %b) + ret %out +} + +; +; WHILELT +; + +define @whilelt_b_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelt_b_ww: +; CHECK: whilelt p0.b, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv16i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelt_b_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelt_b_xx: +; CHECK: whilelt p0.b, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilelt_h_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelt_h_ww: +; CHECK: whilelt p0.h, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelt_h_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelt_h_xx: +; CHECK: whilelt p0.h, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv8i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilelt_s_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelt_s_ww: +; CHECK: whilelt p0.s, w0, w1 +; CHECK-NEXT: ret + %out = call 
@llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelt_s_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelt_s_xx: +; CHECK: whilelt p0.s, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64 %a, i64 %b) + ret %out +} + +define @whilelt_d_ww(i32 %a, i32 %b) { +; CHECK-LABEL: whilelt_d_ww: +; CHECK: whilelt p0.d, w0, w1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32 %a, i32 %b) + ret %out +} + +define @whilelt_d_xx(i64 %a, i64 %b) { +; CHECK-LABEL: whilelt_d_xx: +; CHECK: whilelt p0.d, x0, x1 +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64 %a, i64 %b) + ret %out +} + +declare @llvm.aarch64.sve.whilele.nxv16i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilele.nxv16i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilele.nxv8i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilele.nxv8i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilele.nxv4i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilele.nxv4i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilele.nxv2i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilele.nxv2i1.i64(i64, i64) + +declare @llvm.aarch64.sve.whilelo.nxv16i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilelo.nxv8i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilelo.nxv4i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelo.nxv4i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilelo.nxv2i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelo.nxv2i1.i64(i64, i64) + +declare @llvm.aarch64.sve.whilels.nxv16i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilels.nxv16i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilels.nxv8i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilels.nxv8i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilels.nxv4i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilels.nxv4i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilels.nxv2i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilels.nxv2i1.i64(i64, i64) + +declare @llvm.aarch64.sve.whilelt.nxv16i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelt.nxv16i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelt.nxv8i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelt.nxv4i1.i64(i64, i64) +declare @llvm.aarch64.sve.whilelt.nxv2i1.i32(i32, i32) +declare @llvm.aarch64.sve.whilelt.nxv2i1.i64(i64, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-pred-log.ll b/llvm/test/CodeGen/AArch64/sve-pred-log.ll new file mode 100644 index 0000000000000..772e3f43b7c3d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-pred-log.ll @@ -0,0 +1,545 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +define @vselect_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: vselect_16: +; CHECK: sel p0.b, p0, p1.b, p2.b +; CHECK-NEXT: ret + %res = select %Pg, %Pn, %Pd + ret %res; +} + +define @vselect_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: vselect_8: +; CHECK: sel p0.b, p0, p1.b, p2.b +; CHECK-NEXT: ret + %res = select %Pg, %Pn, %Pd + ret %res; +} + +define @vselect_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: vselect_4: +; CHECK: sel p0.b, p0, p1.b, p2.b +; CHECK-NEXT: ret + %res = select %Pg, %Pn, %Pd + ret %res; +} + +define @vselect_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: vselect_2: +; CHECK: sel p0.b, p0, p1.b, p2.b +; CHECK-NEXT: ret + %res = select %Pg, %Pn, %Pd + ret %res; +} + +define @and_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: and_16: +; CHECK: and p0.b, p0/z, p1.b, p2.b +; 
CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.and.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @and_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: and_8: +; CHECK: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.and.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @and_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: and_4: +; CHECK: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.and.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @and_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: and_2: +; CHECK: and p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.and.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + + +define @bic_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bic_16: +; CHECK: bic p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bic.pred.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @bic_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bic_8: +; CHECK: bic p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bic.pred.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @bic_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bic_4: +; CHECK: bic p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bic.pred.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @bic_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bic_2: +; CHECK: bic p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bic.pred.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eor_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eor_16: +; CHECK: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eor.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eor_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eor_8: +; CHECK: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eor.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eor_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eor_4: +; CHECK: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eor.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eor_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eor_2: +; CHECK: eor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eor.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @ands_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: ands_16: +; CHECK: ands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ands.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @ands_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: ands_8: +; CHECK: ands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ands.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @ands_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: ands_4: +; CHECK: ands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ands.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @ands_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: ands_2: +; CHECK: ands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ands.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + + +define @bics_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bics_16: +; CHECK: bics p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bics.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @bics_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bics_8: +; CHECK: bics p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bics.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @bics_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bics_4: +; CHECK: bics p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bics.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define 
@bics_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: bics_2: +; CHECK: bics p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.bics.nxv2i1( %Pg, + %Pn, + %Pd) + ret %res; +} + + +define @eors_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eors_16: +; CHECK: eors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eors.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eors_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eors_8: +; CHECK: eors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eors.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eors_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eors_4: +; CHECK: eors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eors.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @eors_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: eors_2: +; CHECK: eors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.eors.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + + +define @orr_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orr_16: +; CHECK: orr p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orr.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orr_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orr_8: +; CHECK: orr p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orr.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orr_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orr_4: +; CHECK: orr p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orr.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orr_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orr_2: +; CHECK: orr p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orr.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + + +define @orn_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orn_16: +; CHECK: orn p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orn.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orn_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orn_8: +; CHECK: orn p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orn.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orn_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orn_4: +; CHECK: orn p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orn.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orn_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orn_2: +; CHECK: orn p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orn.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nor_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nor_16: +; CHECK: nor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nor.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nor_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nor_8: +; CHECK: nor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nor.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nor_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nor_4: +; CHECK: nor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nor.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nor_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nor_2: +; CHECK: nor p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nor.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nand_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nand_16: +; CHECK: nand p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nand.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nand_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nand_8: +; CHECK: nand p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call 
@llvm.aarch64.sve.nand.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nand_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nand_4: +; CHECK: nand p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nand.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nand_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nand_2: +; CHECK: nand p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nand.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orrs_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orrs_16: +; CHECK: orrs p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orrs.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orrs_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orrs_8: +; CHECK: orrs p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orrs.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orrs_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orrs_4: +; CHECK: orrs p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orrs.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orrs_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orrs_2: +; CHECK: orrs p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orrs.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orns_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orns_16: +; CHECK: orns p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orns.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orns_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orns_8: +; CHECK: orns p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orns.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orns_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orns_4: +; CHECK: orns p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orns.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @orns_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: orns_2: +; CHECK: orns p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.orns.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nors_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nors_16: +; CHECK: nors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nors.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nors_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nors_8: +; CHECK: nors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nors.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nors_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nors_4: +; CHECK: nors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nors.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nors_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nors_2: +; CHECK: nors p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nors.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nands_16( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nands_16: +; CHECK: nands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nands.nxv16i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nands_8( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nands_8: +; CHECK: nands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nands.nxv8i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nands_4( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nands_4: +; CHECK: nands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nands.nxv4i1( %Pg, %Pn, %Pd) + ret %res; +} + +define @nands_2( %Pg, %Pn, %Pd) { +; CHECK-LABEL: nands_2: +; CHECK: nands p0.b, p0/z, p1.b, p2.b +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.nands.nxv2i1( %Pg, %Pn, %Pd) + ret %res; +} + 
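+
+; A minimal illustrative sketch, not part of the patch above: the same
+; and.nxv16i1 intrinsic with its scalable predicate types written out in
+; full, inferred from the intrinsic's .nxv16i1 suffix. The expected code is
+; assumed to match the and_16 test earlier in this file.
+define <vscale x 16 x i1> @and_16_example(<vscale x 16 x i1> %Pg, <vscale x 16 x i1> %Pn, <vscale x 16 x i1> %Pd) {
+; CHECK-LABEL: and_16_example:
+; CHECK: and p0.b, p0/z, p1.b, p2.b
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.and.nxv16i1(<vscale x 16 x i1> %Pg, <vscale x 16 x i1> %Pn, <vscale x 16 x i1> %Pd)
+  ret <vscale x 16 x i1> %res
+}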
+declare @llvm.aarch64.sve.and.nxv16i1(, , ) +declare @llvm.aarch64.sve.and.nxv8i1(, , ) +declare @llvm.aarch64.sve.and.nxv4i1(, , ) +declare @llvm.aarch64.sve.and.nxv2i1(, , ) +declare @llvm.aarch64.sve.bic.pred.nxv16i1(, , ) +declare @llvm.aarch64.sve.bic.pred.nxv8i1(, , ) +declare @llvm.aarch64.sve.bic.pred.nxv4i1(, , ) +declare @llvm.aarch64.sve.bic.pred.nxv2i1(, , ) +declare @llvm.aarch64.sve.eor.nxv16i1(, , ) +declare @llvm.aarch64.sve.eor.nxv8i1(, , ) +declare @llvm.aarch64.sve.eor.nxv4i1(, , ) +declare @llvm.aarch64.sve.eor.nxv2i1(, , ) +declare @llvm.aarch64.sve.ands.nxv16i1(, , ) +declare @llvm.aarch64.sve.ands.nxv8i1(, , ) +declare @llvm.aarch64.sve.ands.nxv4i1(, , ) +declare @llvm.aarch64.sve.ands.nxv2i1(, , ) +declare @llvm.aarch64.sve.bics.nxv16i1(, , ) +declare @llvm.aarch64.sve.bics.nxv8i1(, , ) +declare @llvm.aarch64.sve.bics.nxv4i1(, , ) +declare @llvm.aarch64.sve.bics.nxv2i1(, , ) +declare @llvm.aarch64.sve.eors.nxv16i1(, , ) +declare @llvm.aarch64.sve.eors.nxv8i1(, , ) +declare @llvm.aarch64.sve.eors.nxv4i1(, , ) +declare @llvm.aarch64.sve.eors.nxv2i1(, , ) +declare @llvm.aarch64.sve.orr.nxv16i1(, , ) +declare @llvm.aarch64.sve.orr.nxv8i1(, , ) +declare @llvm.aarch64.sve.orr.nxv4i1(, , ) +declare @llvm.aarch64.sve.orr.nxv2i1(, , ) +declare @llvm.aarch64.sve.orn.nxv16i1(, , ) +declare @llvm.aarch64.sve.orn.nxv8i1(, , ) +declare @llvm.aarch64.sve.orn.nxv4i1(, , ) +declare @llvm.aarch64.sve.orn.nxv2i1(, , ) +declare @llvm.aarch64.sve.nor.nxv16i1(, , ) +declare @llvm.aarch64.sve.nor.nxv8i1(, , ) +declare @llvm.aarch64.sve.nor.nxv4i1(, , ) +declare @llvm.aarch64.sve.nor.nxv2i1(, , ) +declare @llvm.aarch64.sve.nand.nxv16i1(, , ) +declare @llvm.aarch64.sve.nand.nxv8i1(, , ) +declare @llvm.aarch64.sve.nand.nxv4i1(, , ) +declare @llvm.aarch64.sve.nand.nxv2i1(, , ) +declare @llvm.aarch64.sve.orrs.nxv16i1(, , ) +declare @llvm.aarch64.sve.orrs.nxv8i1(, , ) +declare @llvm.aarch64.sve.orrs.nxv4i1(, , ) +declare @llvm.aarch64.sve.orrs.nxv2i1(, , ) +declare @llvm.aarch64.sve.orns.nxv16i1(, , ) +declare @llvm.aarch64.sve.orns.nxv8i1(, , ) +declare @llvm.aarch64.sve.orns.nxv4i1(, , ) +declare @llvm.aarch64.sve.orns.nxv2i1(, , ) +declare @llvm.aarch64.sve.nors.nxv16i1(, , ) +declare @llvm.aarch64.sve.nors.nxv8i1(, , ) +declare @llvm.aarch64.sve.nors.nxv4i1(, , ) +declare @llvm.aarch64.sve.nors.nxv2i1(, , ) +declare @llvm.aarch64.sve.nands.nxv16i1(, , ) +declare @llvm.aarch64.sve.nands.nxv8i1(, , ) +declare @llvm.aarch64.sve.nands.nxv4i1(, , ) +declare @llvm.aarch64.sve.nands.nxv2i1(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-converts.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-converts.ll new file mode 100644 index 0000000000000..4d110fee41c9f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-converts.ll @@ -0,0 +1,84 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; FCVTLT +; + +define @fcvtlt_f32_f16( %a, %pg, %b) { +; CHECK-LABEL: fcvtlt_f32_f16: +; CHECK: fcvtlt z0.s, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtlt.f32f16( %a, + %pg, + %b) + ret %out +} + +define @fcvtlt_f64_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvtlt_f64_f32: +; CHECK: fcvtlt z0.d, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtlt.f64f32( %a, + %pg, + %b) + ret %out +} + +; +; FCVTNT +; + +define @fcvtnt_f16_f32( %a, %pg, %b) { +; CHECK-LABEL: fcvtnt_f16_f32: +; CHECK: fcvtnt z0.h, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtnt.f16f32( %a, + %pg, + %b) + ret %out +} + +define 
@fcvtnt_f32_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtnt_f32_f64: +; CHECK: fcvtnt z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtnt.f32f64( %a, + %pg, + %b) + ret %out +} + +; +; FCVTX +; + +define @fcvtx_f32_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtx_f32_f64: +; CHECK: fcvtx z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtx.f32f64( %a, + %pg, + %b) + ret %out +} + +; +; FCVTXNT +; + +define @fcvtxnt_f32_f64( %a, %pg, %b) { +; CHECK-LABEL: fcvtxnt_f32_f64: +; CHECK: fcvtxnt z0.s, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fcvtxnt.f32f64( %a, + %pg, + %b) + ret %out +} + +declare @llvm.aarch64.sve.fcvtlt.f32f16(, , ) +declare @llvm.aarch64.sve.fcvtlt.f64f32(, , ) +declare @llvm.aarch64.sve.fcvtnt.f16f32(, , ) +declare @llvm.aarch64.sve.fcvtnt.f32f64(, , ) +declare @llvm.aarch64.sve.fcvtx.f32f64(, , ) +declare @llvm.aarch64.sve.fcvtxnt.f32f64(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-int-binary-logarithm.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-int-binary-logarithm.ll new file mode 100644 index 0000000000000..fe12324a4e0a7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-int-binary-logarithm.ll @@ -0,0 +1,39 @@ +;RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -asm-verbose=0 < %s | FileCheck %s + +; +; FLOGB +; + +define @flogb_f16( %a, %pg, %b) { +; CHECK-LABEL: flogb_f16: +; CHECK: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.flogb.nxv8f16( %a, + %pg, + %b) + ret %out +} + +define @flogb_f32( %a, %pg, %b) { +; CHECK-LABEL: flogb_f32: +; CHECK: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.flogb.nxv4f32( %a, + %pg, + %b) + ret %out +} + +define @flogb_f64( %a, %pg, %b) { +; CHECK-LABEL: flogb_f64: +; CHECK: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.flogb.nxv2f64( %a, + %pg, + %b) + ret %out +} + +declare @llvm.aarch64.sve.flogb.nxv8f16(, , ) +declare @llvm.aarch64.sve.flogb.nxv4f32(, , ) +declare @llvm.aarch64.sve.flogb.nxv2f64(, , ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-widening-mul-acc.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-widening-mul-acc.ll new file mode 100644 index 0000000000000..12cc12ccadfc2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-fp-widening-mul-acc.ll @@ -0,0 +1,127 @@ +;RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; FMLALB (Vectors) +; + +define @fmlalb_h( %a, %b, %c) { +; CHECK-LABEL: fmlalb_h: +; CHECK: fmlalb z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlalb.nxv4f32( %a, + %b, + %c) + ret %out +} + +; +; FMLALB (Indexed) +; + +define @fmlalb_lane_h( %a, %b, %c) { +; CHECK-LABEL: fmlalb_lane_h: +; CHECK: fmlalb z0.s, z1.h, z2.h[0] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlalb.lane.nxv4f32( %a, + %b, + %c, + i32 0) + ret %out +} + +; +; FMLALT (Vectors) +; + +define @fmlalt_h( %a, %b, %c) { +; CHECK-LABEL: fmlalt_h: +; CHECK: fmlalt z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlalt.nxv4f32( %a, + %b, + %c) + ret %out +} + +; +; FMLALT (Indexed) +; + +define @fmlalt_lane_h( %a, %b, %c) { +; CHECK-LABEL: fmlalt_lane_h: +; CHECK: fmlalt z0.s, z1.h, z2.h[1] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlalt.lane.nxv4f32( %a, + %b, + %c, + i32 1) + ret %out +} + +; +; FMLSLB (Vectors) +; + +define @fmlslb_h( %a, %b, %c) { +; CHECK-LABEL: fmlslb_h: +; CHECK: fmlslb z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call 
@llvm.aarch64.sve.fmlslb.nxv4f32( %a, + %b, + %c) + ret %out +} + +; +; FMLSLB (Indexed) +; + +define @fmlslb_lane_h( %a, %b, %c) { +; CHECK-LABEL: fmlslb_lane_h: +; CHECK: fmlslb z0.s, z1.h, z2.h[2] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlslb.lane.nxv4f32( %a, + %b, + %c, + i32 2) + ret %out +} + +; +; FMLSLT (Vectors) +; + +define @fmlslt_h( %a, %b, %c) { +; CHECK-LABEL: fmlslt_h: +; CHECK: fmlslt z0.s, z1.h, z2.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlslt.nxv4f32( %a, + %b, + %c) + ret %out +} + +; +; FMLSLT (Indexed) +; + +define @fmlslt_lane_h( %a, %b, %c) { +; CHECK-LABEL: fmlslt_lane_h: +; CHECK: fmlslt z0.s, z1.h, z2.h[3] +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmlslt.lane.nxv4f32( %a, + %b, + %c, + i32 3) + ret %out +} + +declare @llvm.aarch64.sve.fmlalb.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmlalb.lane.nxv4f32(, , , i32) +declare @llvm.aarch64.sve.fmlalt.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmlalt.lane.nxv4f32(, , , i32) + +declare @llvm.aarch64.sve.fmlslb.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmlslb.lane.nxv4f32(, , , i32) +declare @llvm.aarch64.sve.fmlslt.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmlslt.lane.nxv4f32(, , , i32) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-non-widening-pairwise-arith.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-non-widening-pairwise-arith.ll new file mode 100644 index 0000000000000..055c24b935e08 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-non-widening-pairwise-arith.ll @@ -0,0 +1,191 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; FADDP +; + +define @faddp_f16( %pg, %a, %b) { +; CHECK-LABEL: faddp_f16: +; CHECK: faddp z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.faddp.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @faddp_f32( %pg, %a, %b) { +; CHECK-LABEL: faddp_f32: +; CHECK: faddp z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.faddp.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @faddp_f64( %pg, %a, %b) { +; CHECK-LABEL: faddp_f64: +; CHECK: faddp z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.faddp.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FMAXP +; + +define @fmaxp_f16( %pg, %a, %b) { +; CHECK-LABEL: fmaxp_f16: +; CHECK: fmaxp z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmaxp.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fmaxp_f32( %pg, %a, %b) { +; CHECK-LABEL: fmaxp_f32: +; CHECK: fmaxp z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmaxp.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fmaxp_f64( %pg, %a, %b) { +; CHECK-LABEL: fmaxp_f64: +; CHECK: fmaxp z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmaxp.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FMAXNMP +; + +define @fmaxnmp_f16( %pg, %a, %b) { +; CHECK-LABEL: fmaxnmp_f16: +; CHECK: fmaxnmp z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmaxnmp.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fmaxnmp_f32( %pg, %a, %b) { +; CHECK-LABEL: fmaxnmp_f32: +; CHECK: fmaxnmp z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmaxnmp.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fmaxnmp_f64( %pg, %a, %b) { +; CHECK-LABEL: fmaxnmp_f64: +; CHECK: fmaxnmp z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fmaxnmp.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FMINP +; + +define @fminp_f16( %pg, %a, %b) { +; CHECK-LABEL: 
fminp_f16: +; CHECK: fminp z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fminp.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fminp_f32( %pg, %a, %b) { +; CHECK-LABEL: fminp_f32: +; CHECK: fminp z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fminp.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fminp_f64( %pg, %a, %b) { +; CHECK-LABEL: fminp_f64: +; CHECK: fminp z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fminp.nxv2f64( %pg, + %a, + %b) + ret %out +} + +; +; FMINNMP +; + +define @fminnmp_f16( %pg, %a, %b) { +; CHECK-LABEL: fminnmp_f16: +; CHECK: fminnmp z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fminnmp.nxv8f16( %pg, + %a, + %b) + ret %out +} + +define @fminnmp_f32( %pg, %a, %b) { +; CHECK-LABEL: fminnmp_f32: +; CHECK: fminnmp z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fminnmp.nxv4f32( %pg, + %a, + %b) + ret %out +} + +define @fminnmp_f64( %pg, %a, %b) { +; CHECK-LABEL: fminnmp_f64: +; CHECK: fminnmp z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %out = call @llvm.aarch64.sve.fminnmp.nxv2f64( %pg, + %a, + %b) + ret %out +} + +declare @llvm.aarch64.sve.faddp.nxv8f16(, , ) +declare @llvm.aarch64.sve.faddp.nxv4f32(, , ) +declare @llvm.aarch64.sve.faddp.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmaxp.nxv8f16(, , ) +declare @llvm.aarch64.sve.fmaxp.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmaxp.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fmaxnmp.nxv8f16(, , ) +declare @llvm.aarch64.sve.fmaxnmp.nxv4f32(, , ) +declare @llvm.aarch64.sve.fmaxnmp.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fminp.nxv8f16(, , ) +declare @llvm.aarch64.sve.fminp.nxv4f32(, , ) +declare @llvm.aarch64.sve.fminp.nxv2f64(, , ) + +declare @llvm.aarch64.sve.fminnmp.nxv8f16(, , ) +declare @llvm.aarch64.sve.fminnmp.nxv4f32(, , ) +declare @llvm.aarch64.sve.fminnmp.nxv2f64(, , ) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.mir new file mode 100644 index 0000000000000..54849b4c651d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.mfma.mir @@ -0,0 +1,943 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: mfma_f32_32x32x1f32_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + + ; CHECK-LABEL: name: mfma_f32_32x32x1f32_vva + ; CHECK: liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x1f32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x1f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... + +--- +name: mfma_f32_32x32x1f32_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + + ; CHECK-LABEL: name: mfma_f32_32x32x1f32_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<32 x s32>) = COPY [[COPY2]](<32 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x1f32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x1f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
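+
+# In the _vva tests the scalar inputs already live in VGPRs and the
+# accumulator in AGPRs, so no cross-bank copies are needed. In the _sss
+# tests every operand starts out in SGPRs, which the MFMA cannot read
+# directly; the checks show regbankselect inserting one vgpr COPY per
+# scalar source and an agpr COPY for the accumulator tuple ahead of the
+# G_INTRINSIC. The same pattern repeats for each shape below.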
+
+---
+name: mfma_f32_16x16x1f32_vva
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+
+    ; CHECK-LABEL: name: mfma_f32_16x16x1f32_vva
+    ; CHECK: liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+    ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1
+    ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x1f32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<16 x s32>), 0, 0, 0
+    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15
+    %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x1f32), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3
+...
+
+---
+name: mfma_f32_16x16x1f32_sss
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+
+    ; CHECK-LABEL: name: mfma_f32_16x16x1f32_sss
+    ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32
+    ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33
+    ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
+    ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32)
+    ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>)
+    ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x1f32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<16 x s32>), 0, 0, 0
+    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>)
+    %0:_(s32) = COPY $sgpr32
+    %1:_(s32) = COPY $sgpr33
+    %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15
+    %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x1f32), %0, %1, %2, 0, 0, 0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3
+...
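+
+# The accumulator tuple tracks the intrinsic shape: 32x32x1f32 above uses
+# <32 x s32> across 32 registers, 16x16x1f32 uses <16 x s32>, and the
+# 4x4x1f32 tests below use <4 x s32>. Only the tuple width changes; the
+# vva/sss copy behaviour is identical for each shape.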
+ +--- +name: mfma_f32_4x4x1f32_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_f32_4x4x1f32_vva + ; CHECK: liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x1f32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x1f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_4x4x1f32_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_f32_4x4x1f32_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x1f32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x1f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_32x32x2f32_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_f32_32x32x2f32_vva + ; CHECK: liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2f32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
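(Side note, grounded only in the checks themselves: a larger K for the same 32x32 tile shrinks the accumulator, so the 32x32x1f32 cases carry the accumulator as <32 x s32> while the 32x32x2f32 cases use <16 x s32>. The corresponding declaration, as a sketch rather than a quote from the test:

    declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x2f32(float, float, <16 x float>, i32 immarg, i32 immarg, i32 immarg)
)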
+ +--- +name: mfma_f32_32x32x2f32_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_f32_32x32x2f32_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2f32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_f32_16x16x4f32_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_f32_16x16x4f32_vva + ; CHECK: liveins: $vgpr0, $vgpr1, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f32), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_f32_16x16x4f32_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_f32_16x16x4f32_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f32), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_32x32x4f16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + + ; CHECK-LABEL: name: mfma_f32_32x32x4f16_vva + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4f16), [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
+ +--- +name: mfma_f32_32x32x4f16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + + ; CHECK-LABEL: name: mfma_f32_32x32x4f16_sss + ; CHECK: liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr32_sgpr33 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr34_sgpr35 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY]](<4 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY1]](<4 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<32 x s32>) = COPY [[COPY2]](<32 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4f16), [[COPY3]](<4 x s16>), [[COPY4]](<4 x s16>), [[COPY5]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(<4 x s16>) = COPY $sgpr32_sgpr33 + %1:_(<4 x s16>) = COPY $sgpr34_sgpr35 + %2:_(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
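(The f16 variants take the A and B inputs as 64-bit packed half vectors, which is why the checks above copy them as <4 x s16> out of register pairs. A minimal IR-level sketch, with %a, %b and %acc as placeholder values assumed to be defined earlier:

    declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half>, <4 x half>, <32 x float>, i32 immarg, i32 immarg, i32 immarg)

    ; %a and %b are 64-bit packed <4 x half> operands
    %r = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x4f16(<4 x half> %a, <4 x half> %b, <32 x float> %acc, i32 0, i32 0, i32 0)
)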
+ +--- +name: mfma_f32_16x16x4f16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_f32_16x16x4f16_vva + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f16), [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_f32_16x16x4f16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_f32_16x16x4f16_sss + ; CHECK: liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr32_sgpr33 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr34_sgpr35 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY]](<4 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY1]](<4 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f16), [[COPY3]](<4 x s16>), [[COPY4]](<4 x s16>), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<4 x s16>) = COPY $sgpr32_sgpr33 + %1:_(<4 x s16>) = COPY $sgpr34_sgpr35 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x4f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_f32_4x4x4f16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_f32_4x4x4f16_vva + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4f16), [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_4x4x4f16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_f32_4x4x4f16_sss + ; CHECK: liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr32_sgpr33 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr34_sgpr35 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY]](<4 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY1]](<4 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4f16), [[COPY3]](<4 x s16>), [[COPY4]](<4 x s16>), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<4 x s16>) = COPY $sgpr32_sgpr33 + %1:_(<4 x s16>) = COPY $sgpr34_sgpr35 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x4f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_f32_32x32x8f16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_f32_32x32x8f16_vva + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8f16), [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_f32_32x32x8f16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_f32_32x32x8f16_sss + ; CHECK: liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr32_sgpr33 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr34_sgpr35 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY]](<4 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY1]](<4 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8f16), [[COPY3]](<4 x s16>), [[COPY4]](<4 x s16>), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<4 x s16>) = COPY $sgpr32_sgpr33 + %1:_(<4 x s16>) = COPY $sgpr34_sgpr35 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x8f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_f32_16x16x16f16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_f32_16x16x16f16_vva + ; CHECK: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr2_vgpr3 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16f16), [[COPY]](<4 x s16>), [[COPY1]](<4 x s16>), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %1:_(<4 x s16>) = COPY $vgpr2_vgpr3 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_16x16x16f16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_f32_16x16x16f16_sss + ; CHECK: liveins: $sgpr32_sgpr33, $sgpr34_sgpr35, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr32_sgpr33 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<4 x s16>) = COPY $sgpr34_sgpr35 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY]](<4 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<4 x s16>) = COPY [[COPY1]](<4 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16f16), [[COPY3]](<4 x s16>), [[COPY4]](<4 x s16>), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<4 x s16>) = COPY $sgpr32_sgpr33 + %1:_(<4 x s16>) = COPY $sgpr34_sgpr35 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x16f16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_i32_32x32x4i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + + ; CHECK-LABEL: name: mfma_i32_32x32x4i8_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x4i8), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr2 + %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x4i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
+ +--- +name: mfma_i32_32x32x4i8_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + + ; CHECK-LABEL: name: mfma_i32_32x32x4i8_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<32 x s32>) = COPY [[COPY2]](<32 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x4i8), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x4i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
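(The integer MFMA variants pack four i8 elements of A and of B into one i32 operand each and accumulate in i32 lanes, matching the s32 and <32 x s32> types above. Sketch with placeholder names:

    declare <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32, i32, <32 x i32>, i32 immarg, i32 immarg, i32 immarg)

    ; %a and %b each carry four packed i8 values
    %r = call <32 x i32> @llvm.amdgcn.mfma.i32.32x32x4i8(i32 %a, i32 %b, <32 x i32> %acc, i32 0, i32 0, i32 0)
)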
+ +--- +name: mfma_i32_16x16x4i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_i32_16x16x4i8_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x4i8), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr2 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x4i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_i32_16x16x4i8_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_i32_16x16x4i8_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x4i8), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x4i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_i32_4x4x4i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_i32_4x4x4i8_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.4x4x4i8), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr2 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.4x4x4i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_i32_4x4x4i8_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_i32_4x4x4i8_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.4x4x4i8), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.4x4x4i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_i32_32x32x8i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_i32_32x32x8i8_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x8i8), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr2 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x8i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_i32_32x32x8i8_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_i32_32x32x8i8_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x8i8), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.32x32x8i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_i32_16x16x16i8_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_i32_16x16x16i8_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x16i8), [[COPY]](s32), [[COPY1]](s32), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr2 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x16i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_i32_16x16x16i8_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_i32_16x16x16i8_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x16i8), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(s32) = COPY $sgpr32 + %1:_(s32) = COPY $sgpr33 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.i32.16x16x16i8), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_32x32x2bf16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + + ; CHECK-LABEL: name: mfma_f32_32x32x2bf16_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2bf16), [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<32 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
+ +--- +name: mfma_f32_32x32x2bf16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + + ; CHECK-LABEL: name: mfma_f32_32x32x2bf16_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<32 x s32>) = COPY [[COPY2]](<32 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2bf16), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<32 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY [[INT]](<32 x s32>) + %0:_(<2 x s16>) = COPY $sgpr32 + %1:_(<2 x s16>) = COPY $sgpr33 + %2:_(<32 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + %3:_(<32 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x2bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = COPY %3 +... 
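(The bf16 variants pass their inputs as <2 x i16> bit patterns rather than a dedicated bfloat vector type, which matches the <2 x s16> copies above, including the single-SGPR sss sources. Sketch with placeholder names:

    declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16>, <2 x i16>, <32 x float>, i32 immarg, i32 immarg, i32 immarg)

    ; %a and %b each carry two bfloat16 values as raw i16 bits
    %r = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x2bf16(<2 x i16> %a, <2 x i16> %b, <32 x float> %acc, i32 0, i32 0, i32 0)
)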
+ +--- +name: mfma_f32_16x16x2bf16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_f32_16x16x2bf16_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x2bf16), [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x2bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_f32_16x16x2bf16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_f32_16x16x2bf16_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x2bf16), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<2 x s16>) = COPY $sgpr32 + %1:_(<2 x s16>) = COPY $sgpr33 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x2bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_f32_4x4x2bf16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_f32_4x4x2bf16_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x2bf16), [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x2bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_4x4x2bf16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_f32_4x4x2bf16_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x2bf16), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<2 x s16>) = COPY $sgpr32 + %1:_(<2 x s16>) = COPY $sgpr33 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.4x4x2bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: mfma_f32_32x32x4bf16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + + ; CHECK-LABEL: name: mfma_f32_32x32x4bf16_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16), [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<16 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: mfma_f32_32x32x4bf16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + + ; CHECK-LABEL: name: mfma_f32_32x32x4bf16_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<16 x s32>) = COPY [[COPY2]](<16 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<16 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[INT]](<16 x s32>) + %0:_(<2 x s16>) = COPY $sgpr32 + %1:_(<2 x s16>) = COPY $sgpr33 + %2:_(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %3:_(<16 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.32x32x4bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: mfma_f32_16x16x8bf16_vva +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + + ; CHECK-LABEL: name: mfma_f32_16x16x8bf16_vva + ; CHECK: liveins: $vgpr0, $vgpr2, $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr2 + ; CHECK: [[COPY2:%[0-9]+]]:agpr(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8bf16), [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<2 x s16>) = COPY $vgpr0 + %1:_(<2 x s16>) = COPY $vgpr2 + %2:_(<4 x s32>) = COPY $agpr0_agpr1_agpr2_agpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
+ +--- +name: mfma_f32_16x16x8bf16_sss +legalized: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + + ; CHECK-LABEL: name: mfma_f32_16x16x8bf16_sss + ; CHECK: liveins: $sgpr32, $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr32 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(<2 x s16>) = COPY $sgpr33 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY]](<2 x s16>) + ; CHECK: [[COPY4:%[0-9]+]]:vgpr(<2 x s16>) = COPY [[COPY1]](<2 x s16>) + ; CHECK: [[COPY5:%[0-9]+]]:agpr(<4 x s32>) = COPY [[COPY2]](<4 x s32>) + ; CHECK: [[INT:%[0-9]+]]:agpr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8bf16), [[COPY3]](<2 x s16>), [[COPY4]](<2 x s16>), [[COPY5]](<4 x s32>), 0, 0, 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INT]](<4 x s32>) + %0:_(<2 x s16>) = COPY $sgpr32 + %1:_(<2 x s16>) = COPY $sgpr33 + %2:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %3:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.amdgcn.mfma.f32.16x16x8bf16), %0, %1, %2, 0, 0, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... diff --git a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll index 0dec67ad340cd..895539c00bce9 100644 --- a/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll +++ b/llvm/test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll @@ -16,29 +16,28 @@ define amdgpu_ps void @main(i32, float) { ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: v_interp_p1_f32_e32 v0, v1, attr0.x ; CHECK-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: s_mov_b64 s[2:3], 0 ; CHECK-NEXT: ; implicit-def: $sgpr6_sgpr7 -; CHECK-NEXT: ; implicit-def: $sgpr2_sgpr3 +; CHECK-NEXT: ; implicit-def: $sgpr4_sgpr5 ; CHECK-NEXT: s_branch BB0_3 ; CHECK-NEXT: BB0_1: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] -; CHECK-NEXT: s_mov_b64 s[10:11], 0 +; CHECK-NEXT: s_mov_b64 s[8:9], 0 ; CHECK-NEXT: BB0_2: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_and_b64 s[8:9], exec, s[6:7] -; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[4:5] -; CHECK-NEXT: s_andn2_b64 s[2:3], s[2:3], exec -; CHECK-NEXT: s_and_b64 s[4:5], s[10:11], exec -; CHECK-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] -; CHECK-NEXT: s_mov_b64 s[4:5], s[8:9] -; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_and_b64 s[10:11], exec, s[6:7] +; CHECK-NEXT: s_or_b64 s[2:3], s[10:11], s[2:3] +; CHECK-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; CHECK-NEXT: s_and_b64 s[8:9], s[8:9], exec +; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[2:3] ; CHECK-NEXT: s_cbranch_execz BB0_6 ; CHECK-NEXT: BB0_3: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: s_or_b64 s[6:7], s[6:7], exec ; CHECK-NEXT: s_cmp_lt_u32 s0, 32 -; CHECK-NEXT: s_mov_b64 s[10:11], -1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 ; CHECK-NEXT: s_cbranch_scc0 BB0_2 ; CHECK-NEXT: ; %bb.4: ; %endif1 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -53,9 +52,9 @@ define amdgpu_ps void @main(i32, float) { ; CHECK-NEXT: s_xor_b64 s[6:7], exec, -1 ; CHECK-NEXT: s_branch BB0_1 ; CHECK-NEXT: BB0_6: ; %Flow2 -; CHECK-NEXT: s_or_b64 exec, exec, s[8:9] +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] ; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[2:3] +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[4:5] ; CHECK-NEXT: 
; mask branch BB0_8 ; CHECK-NEXT: BB0_7: ; %if1 ; CHECK-NEXT: v_sqrt_f32_e32 v1, v0 @@ -63,6 +62,7 @@ define amdgpu_ps void @main(i32, float) { ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] ; CHECK-NEXT: exp mrt0 v1, v1, v1, v1 done vm ; CHECK-NEXT: s_endpgm + ; this is the divergent branch with the condition not marked as divergent start: %v0 = call float @llvm.amdgcn.interp.p1(float %1, i32 0, i32 0, i32 %0) diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir new file mode 100644 index 0000000000000..754536577faec --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -0,0 +1,71 @@ +# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands %s -o - | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: name: test_part_fold{{$}} +# GCN: %2:sreg_32 = S_ADD_I32 70, %1 +--- +name: test_part_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + %2:sreg_32 = S_ADD_I32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_inline_const{{$}} +# GCN: %2:sreg_32 = S_ADD_I32 70, 63 +--- +name: test_inline_const +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 63 + %2:sreg_32 = S_ADD_I32 %0, %1, implicit-def $scc +... +# GCN-LABEL: name: test_obscure{{$}} +# GCN: %2:sreg_32 = S_LSHL2_ADD_U32 70, %1 +--- +name: test_obscure +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + %2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc +... +# GCN-LABEL: name: test_obscure_inline{{$}} +# GCN: %2:sreg_32 = S_LSHL2_ADD_U32 70, 63 +--- +name: test_obscure_inline +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 63 + %2:sreg_32 = S_LSHL2_ADD_U32 %0, %1, implicit-def $scc +... +# GCN-LABEL: name: test_frameindex{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, %0 +--- +name: test_frameindex +tracksRegLiveness: true +stack: + - { id: 0, type: default, offset: 0, size: 64, alignment: 16} +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc +... +# GCN-LABEL: name: test_frameindex_inline{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 %stack.0, 63 +--- +name: test_frameindex_inline +tracksRegLiveness: true +stack: + - { id: 0, type: default, offset: 0, size: 64, alignment: 16} +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 63 + %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc +... 
diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index fff1c22918ec6..51d1c091ab913 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -3,11 +3,10 @@ ; SI-LABEL: {{^}}i1_copy_from_loop: ; -; SI: [[LOOP:BB0_[0-9]+]]: ; %Flow1 -; SI: s_or_b64 exec, exec, [[EXIT_MASK:s\[[0-9]+:[0-9]+\]]] ; SI: ; %Flow +; SI: s_or_b64 [[EXIT_MASK:s\[[0-9]+:[0-9]+\]]] ; SI: s_and_b64 [[ACCUM_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_MASK:s\[[0-9]+:[0-9]+\]]], exec -; SI: s_or_b64 [[I1_VALUE:s\[[0-9]+:[0-9]+\]]], s[6:7], [[ACCUM_MASK]] +; SI: s_or_b64 [[I1_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[ACCUM_MASK]] ; SI: s_cbranch_execz [[FOR_END_LABEL:BB0_[0-9]+]] ; SI: ; %for.body diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 7b34d873f7a74..25742666a5794 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -1297,8 +1297,30 @@ bb: ; GCN-LABEL: {{^}}test_mfma_f32_32x32x1f32_vecarg: ; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2.0 ; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1.0 -; GCN-COUNT-8: global_load_dwordx4 -; GCN-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: global_load_dwordx4 +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} +; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_mfma_f32_32x32x1f32 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3 ; GCN-COUNT-32: v_accvgpr_read_b32 ; GCN-COUNT-8: global_store_dwordx4 diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 46c4b1e6b3a1c..684b183de690c 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -40,10 +40,9 @@ ; GCN: [[FLOW]]: ; %Flow ; GCN: ; in Loop: Header=BB0_1 Depth=1 -; GCN: s_and_b64 [[BROKEN_MASK]], exec, [[INNER_MASK]] -; GCN: s_or_b64 [[BROKEN_MASK]], [[BROKEN_MASK]], [[ACCUM_MASK]] -; GCN: s_mov_b64 [[ACCUM_MASK]], [[BROKEN_MASK]] -; GCN: s_andn2_b64 exec, exec, [[BROKEN_MASK]] +; GCN: s_and_b64 [[AND_MASK:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]] +; GCN-NEXT: s_or_b64 [[ACCUM_MASK]], [[AND_MASK]], [[ACCUM_MASK]] +; GCN-NEXT: s_andn2_b64 exec, exec, [[ACCUM_MASK]] ; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]] ; GCN: ; %bb.4: ; %bb9 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 
08d8ec0fba4dc..5222ae56db87a 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -25,22 +25,20 @@ ; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; GCN: [[FLOW2:BB[0-9]+_[0-9]+]]: ; %Flow2 -; GCN: s_or_b64 exec, exec, [[TMP0:s\[[0-9]+:[0-9]+\]]] +; GCN: s_or_b64 exec, exec, [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]] ; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]] -; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]] -; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]] -; GCN: s_andn2_b64 exec, exec, [[TMP1]] +; GCN: s_or_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[LEFT_OUTER]] +; GCN: s_andn2_b64 exec, exec, [[LEFT_OUTER]] ; GCN: s_cbranch_execz [[IF_BLOCK:BB[0-9]+_[0-9]+]] ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}} -; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; GCN: s_mov_b64 [[LEFT_INNER]], 0{{$}} ; GCN: ; %Flow ; GCN: s_or_b64 exec, exec, [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]] -; GCN: s_and_b64 [[TMP0]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]] -; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]] -; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]] -; GCN: s_andn2_b64 exec, exec, [[TMP0]] +; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]] +; GCN: s_or_b64 [[LEFT_INNER]], [[TMP0]], [[LEFT_INNER]] +; GCN: s_andn2_b64 exec, exec, [[LEFT_INNER]] ; GCN: s_cbranch_execz [[FLOW2]] ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}} @@ -82,17 +80,17 @@ ENDIF: ; preds = %LOOP ; OPT: llvm.amdgcn.end.cf ; GCN-LABEL: {{^}}multi_if_break_loop: -; GCN: s_mov_b64 [[BROKEN_THREADS_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} +; GCN: s_mov_b64 [[SAVED_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %Flow4 -; GCN: s_and_b64 [[BROKEN_THREADS_MASK]], exec, [[BROKEN_THREADS_MASK]] -; GCN: s_or_b64 [[BROKEN_THREADS_MASK]], [[BROKEN_THREADS_MASK]], [[SAVED:s\[[0-9]+:[0-9]+\]]] -; GCN: s_andn2_b64 exec, exec, [[BROKEN_THREADS_MASK]] +; GCN: s_and_b64 [[ANDTMP0:s\[[0-9]+:[0-9]+\]]], exec, {{s\[[0-9]+:[0-9]+\]}} +; GCN: s_or_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], [[ANDTMP0]], [[SAVED_MASK]] +; GCN: s_and_b64 [[BROKEN_THREADS_MASK:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, exec +; GCN: s_andn2_b64 exec, exec, [[MASK1]] ; GCN-NEXT: s_cbranch_execz [[LOOP_EXIT:BB[0-9]+_[0-9]+]] ; GCN: ; %bb1{{$}} ; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], -; GCN: s_mov_b64 [[SAVED]], [[BROKEN_THREADS_MASK]] ; GCN: ; %LeafBlock1 ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[LOAD0]] @@ -122,7 +120,7 @@ ENDIF: ; preds = %LOOP ; GCN: s_branch [[LOOP]] ; GCN: [[LOOP_EXIT]]: ; %Flow6 -; GCN: s_or_b64 exec, exec, [[BROKEN_THREADS_MASK]] +; GCN: s_or_b64 exec, exec, [[SAVED_MASK]] define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { bb: diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index dc7a7c804bee1..1c7adc39fe290 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -91,3 +91,86 @@ body: | $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit 
$sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 S_ENDPGM 0, implicit $vcc ... + +# When only one 64-bit SGPR is available for the unused carry out pre gfx9, +# we must reuse one of the 32-bit SGPR sub-regs to materialize the offset. + +--- +name: scavenge_sgpr_pei_one_sgpr_64 +tracksRegLiveness: true + +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 } + +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr34 + frameOffsetReg: $sgpr33 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr1 + + ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64 + ; CHECK: liveins: $vgpr1 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; CHECK: $sgpr28 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr28, implicit $exec + ; CHECK: $sgpr28 = S_MOV_B32 8192 + ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 + ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: S_ENDPGM 0, implicit $vcc + S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit 
$sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 + S_ENDPGM 0, implicit $vcc +... + +# Prefer to use vcc as unused carry out. + +--- +name: scavenge_sgpr_pei_prefer_vcc +tracksRegLiveness: true + +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 } + +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + scratchWaveOffsetReg: $sgpr34 + frameOffsetReg: $sgpr33 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr1 + + ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc + ; CHECK: liveins: $vgpr1 + ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 + ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 + ; CHECK: $vcc_hi = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $vcc_hi, implicit $exec + ; CHECK: $vcc_lo = S_MOV_B32 8192 + ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 + ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 + ; CHECK: S_ENDPGM 0 + S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 + $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, 
implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index 14d78fbef29ea..23bb18e738f54 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -37,9 +37,8 @@ ENDIF: ; SI: ; %endif ; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop -; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]] ; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]] -; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]] +; SI: s_or_b64 [[LEFT]], [[TMP1]], [[LEFT]] ; SI: s_andn2_b64 exec, exec, [[LEFT]] ; SI: s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index ef17825024eda..ea74268dbe7c2 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -223,9 +223,8 @@ exit: ; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]] ; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]] ; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], -; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]] -; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]] -; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]] +; SI-NEXT: s_or_b64 [[COND_STATE]], [[TMP1]], [[COND_STATE]] +; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]] ; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]] ; SI: [[LABEL_EXIT]]: diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 91a993181979d..92808fec360f4 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -243,14 +243,12 @@ bb13: ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], [[MASK0]] ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], [[MASK0]] ; GCN: BB{{.*}}: ; %Flow -; GFX1032: s_and_b32 [[MASK0:s[0-9]+]], exec_lo, [[MASK1]] -; GFX1064: s_and_b64 [[MASK0:s\[[0-9:]+\]]], exec, [[MASK1]] -; GFX1032: s_or_b32 [[MASK0]], [[MASK0]], [[ACC:s[0-9]+]] -; GFX1064: s_or_b64 [[MASK0]], [[MASK0]], [[ACC:s\[[0-9:]+\]]] -; GFX1032: s_mov_b32 [[ACC]], [[MASK0]] -; GFX1064: s_mov_b64 [[ACC]], [[MASK0]] -; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[MASK0]] -; GFX1064: s_andn2_b64 exec, exec, [[MASK0]] +; GFX1032: s_and_b32 [[TMP0:s[0-9]+]], exec_lo, [[MASK1]] +; GFX1064: s_and_b64 [[TMP0:s\[[0-9:]+\]]], exec, [[MASK1]] +; GFX1032: s_or_b32 [[ACC:s[0-9]+]], [[TMP0]], [[ACC]] +; GFX1064: s_or_b64 [[ACC:s\[[0-9:]+\]]], [[TMP0]], [[ACC]] +; GFX1032: s_andn2_b32 exec_lo, exec_lo, [[ACC]] +; GFX1064: s_andn2_b64 exec, exec, [[ACC]] ; GCN: s_cbranch_execz ; GCN: BB{{.*}}: ; GCN: s_load_dword [[LOAD:s[0-9]+]] diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-calls.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-calls.ll deleted file mode 100644 index 8d58c8e69a556..0000000000000 --- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-calls.ll +++ /dev/null @@ -1,230 +0,0 @@ -; RUN: llc -mtriple=thumbv8 -arm-disable-cgp=false %s -o - | FileCheck %s -; RUN: llc -mtriple=armv8 -arm-disable-cgp=false %s -o - | FileCheck %s - -; Check that the pass doesn't try to promote the immediate parameters. -; CHECK-LABEL: call_with_imms -; CHECK-NOT: uxt -define i8 @call_with_imms(i8* %arg) { - %call = tail call arm_aapcs_vfpcc zeroext i8 @dummy2(i8* nonnull %arg, i8 zeroext 0, i8 zeroext 0) - %cmp = icmp eq i8 %call, 0 - %res = select i1 %cmp, i8 %call, i8 1 - ret i8 %res -} - -; Test that the call result is still extended. 
-; CHECK-LABEL: test_call: -; CHECK: bl -; CHECK-NEXT: sxtb r1, r0 -define i16 @test_call(i8 zeroext %arg) { - %call = call i8 @dummy_i8(i8 %arg) - %cmp = icmp ult i8 %call, 128 - %conv = zext i1 %cmp to i16 - ret i16 %conv -} - -; CHECK-LABEL: promote_i8_sink_i16_1 -; CHECK: bl dummy_i8 -; CHECK: add{{.*}} r0, #1 -; CHECK-NOT: uxt -; CHECK: cmp r0 -define i16 @promote_i8_sink_i16_1(i8 zeroext %arg0, i16 zeroext %arg1, i16 zeroext %arg2) { - %call = tail call zeroext i8 @dummy_i8(i8 %arg0) - %add = add nuw i8 %call, 1 - %conv = zext i8 %add to i16 - %cmp = icmp ne i16 %conv, %arg1 - %sel = select i1 %cmp, i16 %arg1, i16 %arg2 - %res = tail call zeroext i16 @dummy3(i16 %sel) - ret i16 %res -} - -; CHECK-LABEL: promote_i8_sink_i16_2 -; CHECK: bl dummy_i8 -; CHECK: add{{.*}} r0, #1 -; CHECK-NOT: uxt -; CHECK: cmp r0 -define i16 @promote_i8_sink_i16_2(i8 zeroext %arg0, i8 zeroext %arg1, i16 zeroext %arg2) { - %call = tail call zeroext i8 @dummy_i8(i8 %arg0) - %add = add nuw i8 %call, 1 - %cmp = icmp ne i8 %add, %arg1 - %conv = zext i8 %arg1 to i16 - %sel = select i1 %cmp, i16 %conv, i16 %arg2 - %res = tail call zeroext i16 @dummy3(i16 %sel) - ret i16 %res -} - -@uc = global i8 42, align 1 -@LL = global i64 0, align 8 - -; CHECK-LABEL: zext_i64 -; CHECK: ldrb -; CHECK: strd -define void @zext_i64() { -entry: - %0 = load i8, i8* @uc, align 1 - %conv = zext i8 %0 to i64 - store i64 %conv, i64* @LL, align 8 - %cmp = icmp eq i8 %0, 42 - %conv1 = zext i1 %cmp to i32 - %call = tail call i32 bitcast (i32 (...)* @assert to i32 (i32)*)(i32 %conv1) - ret void -} - -@a = global i16* null, align 4 -@b = global i32 0, align 4 - -; CHECK-LABEL: constexpr -; CHECK: uxth -define i32 @constexpr() { -entry: - store i32 ptrtoint (i32* @b to i32), i32* @b, align 4 - %0 = load i16*, i16** @a, align 4 - %1 = load i16, i16* %0, align 2 - %or = or i16 %1, ptrtoint (i32* @b to i16) - store i16 %or, i16* %0, align 2 - %cmp = icmp ne i16 %or, 4 - %conv3 = zext i1 %cmp to i32 - %call = tail call i32 bitcast (i32 (...)* @e to i32 (i32)*)(i32 %conv3) #2 - ret i32 undef -} - -; The call to safe_lshift_func takes two parameters, but they're the same value, -; just one is zext. We do support zext now, so the transformation should -; trigger and we don't want to see uxtb here.
-; CHECK-LABEL: call_zext_i8_i32 -; CHECK-NOT: uxt -define fastcc i32 @call_zext_i8_i32(i32 %p_45, i8 zeroext %p_46) { -for.cond8.preheader: - %call217 = call fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 zeroext undef) - %tobool219 = icmp eq i8 %call217, 0 - br i1 %tobool219, label %for.end411, label %for.cond273.preheader - -for.cond273.preheader: ; preds = %for.cond8.preheader - %call217.lcssa = phi i8 [ %call217, %for.cond8.preheader ] - %conv218.le = zext i8 %call217.lcssa to i32 - %call346 = call fastcc zeroext i8 @safe_lshift_func(i8 zeroext %call217.lcssa, i32 %conv218.le) - unreachable - -for.end411: ; preds = %for.cond8.preheader - %call452 = call fastcc i64 @safe_sub_func_int64_t_s_s(i64 undef, i64 4) - unreachable -} - -%struct.anon = type { i32 } - -@g_57 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4 -@g_893 = hidden local_unnamed_addr global %struct.anon zeroinitializer, align 4 -@g_82 = hidden local_unnamed_addr global i32 0, align 4 - -; Test that the transform bails on finding %conv4, a trunc -; CHECK-LABEL: call_return_pointer -; CHECK: sxth -; CHECK: uxt -define hidden i32 @call_return_pointer(i8 zeroext %p_13) local_unnamed_addr #0 { -entry: - %conv1 = zext i8 %p_13 to i16 - %call = tail call i16** @func_62(i8 zeroext undef, i32 undef, i16 signext %conv1, i32* undef) - %0 = load i32, i32* getelementptr inbounds (%struct.anon, %struct.anon* @g_893, i32 0, i32 0), align 4 - %conv2 = trunc i32 %0 to i16 - br label %for.cond - -for.cond: ; preds = %for.cond.backedge, %entry - %p_13.addr.0 = phi i8 [ %p_13, %entry ], [ %p_13.addr.0.be, %for.cond.backedge ] - %tobool = icmp eq i8 %p_13.addr.0, 0 - br i1 %tobool, label %for.cond.backedge, label %if.then - -for.cond.backedge: ; preds = %for.cond, %if.then - %p_13.addr.0.be = phi i8 [ %conv4, %if.then ], [ 0, %for.cond ] - br label %for.cond - -if.then: ; preds = %for.cond - %call3 = tail call fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %conv2) - %conv4 = trunc i16 %call3 to i8 - br label %for.cond.backedge -} - -; Check that d.sroa.0.0.be is promoted and passed directly into the tail call.
-; CHECK-LABEL: check_zext_phi_call_arg -; CHECK-NOT: uxt -define i32 @check_zext_phi_call_arg() { -entry: - br label %for.cond - -for.cond: ; preds = %for.cond.backedge, %entry - %d.sroa.0.0 = phi i16 [ 30, %entry ], [ %d.sroa.0.0.be, %for.cond.backedge ] - %tobool = icmp eq i16 %d.sroa.0.0, 0 - br i1 %tobool, label %for.cond.backedge, label %if.then - -for.cond.backedge: ; preds = %for.cond, %if.then - %d.sroa.0.0.be = phi i16 [ %call, %if.then ], [ 0, %for.cond ] - br label %for.cond - -if.then: ; preds = %for.cond - %d.sroa.0.0.insert.ext = zext i16 %d.sroa.0.0 to i32 - %call = tail call zeroext i16 bitcast (i16 (...)* @f to i16 (i32)*)(i32 %d.sroa.0.0.insert.ext) #2 - br label %for.cond.backedge -} - -%struct.atomic_flag = type { i8 } - -; CHECK-LABEL: atomic_flag_test_and_set -; CHECK-NOT: uxt -define zeroext i1 @atomic_flag_test_and_set(%struct.atomic_flag* %object) { -entry: - %_Value = getelementptr inbounds %struct.atomic_flag, %struct.atomic_flag* %object, i32 0, i32 0 - %call = tail call arm_aapcscc zeroext i8 @__atomic_exchange_1(i8* %_Value, i8 zeroext 1, i32 5) #1 - %0 = and i8 %call, 1 - %tobool = icmp ne i8 %0, 0 - ret i1 %tobool -} - -; CHECK-LABEL: i1_zeroext_call -; CHECK: uxt -define i1 @i1_zeroext_call(i16* %ts, i32 %a, i16* %b, i8* %c) { -entry: - %0 = load i16, i16* %ts, align 2 - %conv.i860 = trunc i32 %a to i16 - store i16 %conv.i860, i16* %b, align 2 - %call.i848 = call zeroext i1 @i1_zeroext(i8* %c, i32 64, i16 zeroext %conv.i860) - br i1 %call.i848, label %if.then223, label %if.else227 - -if.then223: - %cmp235 = icmp eq i16 %0, %conv.i860 - br label %exit - -if.else227: - %cmp236 = icmp ult i16 %0, %conv.i860 - br label %exit - -exit: - %retval = phi i1 [ %cmp235, %if.then223 ], [ %cmp236, %if.else227 ] - ret i1 %retval -} - -; CHECK-LABEL: promote_arg_pass_to_call -; CHECK: uxtb -define i16 @promote_arg_pass_to_call(i16 zeroext %arg1, i16 zeroext %arg2) { - %conv = add nuw i16 %arg1, 15 - %mul = mul nuw nsw i16 %conv, 3 - %cmp = icmp ult i16 %mul, %arg2 - %trunc = trunc i16 %arg1 to i8 - %res = call zeroext i16 @dummy4(i1 %cmp, i8 %trunc, i16 %arg1) - ret i16 %res -} - - -declare i32 @assert(...) -declare i8 @dummy_i8(i8) -declare i8 @dummy2(i8*, i8, i8) -declare i16 @dummy3(i16) -declare i16 @dummy4(i1, i8, i16) - -declare dso_local i32 @e(...) local_unnamed_addr #1 -declare dso_local zeroext i16 @f(...) 
local_unnamed_addr #1 -declare dso_local arm_aapcscc i8 @__atomic_exchange_1(i8*, i8, i32) local_unnamed_addr - -declare noalias i16** @func_62(i8 zeroext %p_63, i32 %p_64, i16 signext %p_65, i32* nocapture readnone %p_66) -declare fastcc signext i16 @safe_sub_func_int16_t_s_s(i16 signext %si2) -declare dso_local fastcc i64 @safe_sub_func_int64_t_s_s(i64, i64) -declare dso_local fastcc zeroext i8 @safe_lshift_func(i8 zeroext, i32) -declare dso_local fastcc zeroext i8 @safe_mul_func_uint8_t_u_u(i8 returned zeroext) -declare i1 @i1_zeroext(i8*, i32, i16 zeroext) diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-casts.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-casts.ll deleted file mode 100644 index 538f110ffd767..0000000000000 --- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-casts.ll +++ /dev/null @@ -1,2243 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NODSP --check-prefix=CHECK-NODSP-V8 -; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NODSP --check-prefix=CHECK-NODSP-V7 -; RUN: llc -mtriple=thumbv7em -mcpu=cortex-m7 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DSP -; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DSP-IMM - -; Transform will fail because the trunc is not a sink. - -define i16 @dsp_trunc(i32 %arg0, i32 %arg1, i16* %gep0, i16* %gep1) { -; CHECK-NODSP-V8-LABEL: dsp_trunc: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: add r1, r0 -; CHECK-NODSP-V8-NEXT: ldrh r0, [r2] -; CHECK-NODSP-V8-NEXT: ldrh r2, [r3] -; CHECK-NODSP-V8-NEXT: add r0, r1 -; CHECK-NODSP-V8-NEXT: subs r1, r2, r1 -; CHECK-NODSP-V8-NEXT: uxth r3, r0 -; CHECK-NODSP-V8-NEXT: uxth r2, r1 -; CHECK-NODSP-V8-NEXT: cmp r3, r2 -; CHECK-NODSP-V8-NEXT: it lo -; CHECK-NODSP-V8-NEXT: movlo r0, r1 -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: dsp_trunc: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: ldrh r2, [r2] -; CHECK-NODSP-V7-NEXT: add r1, r0 -; CHECK-NODSP-V7-NEXT: ldrh r3, [r3] -; CHECK-NODSP-V7-NEXT: adds r0, r2, r1 -; CHECK-NODSP-V7-NEXT: subs r1, r3, r1 -; CHECK-NODSP-V7-NEXT: uxth r3, r0 -; CHECK-NODSP-V7-NEXT: uxth r2, r1 -; CHECK-NODSP-V7-NEXT: cmp r3, r2 -; CHECK-NODSP-V7-NEXT: it lo -; CHECK-NODSP-V7-NEXT: movlo r0, r1 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: dsp_trunc: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: add r0, r1 -; CHECK-DSP-NEXT: ldrh r1, [r3] -; CHECK-DSP-NEXT: ldrh r2, [r2] -; CHECK-DSP-NEXT: subs r1, r1, r0 -; CHECK-DSP-NEXT: add r0, r2 -; CHECK-DSP-NEXT: uxth r3, r1 -; CHECK-DSP-NEXT: uxth r2, r0 -; CHECK-DSP-NEXT: cmp r2, r3 -; CHECK-DSP-NEXT: it lo -; CHECK-DSP-NEXT: movlo r0, r1 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: dsp_trunc: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: add r0, r1 -; CHECK-DSP-IMM-NEXT: movs r1, #0 -; CHECK-DSP-IMM-NEXT: uxth r0, r0 -; CHECK-DSP-IMM-NEXT: usub16 r1, r1, r0 -; CHECK-DSP-IMM-NEXT: ldrh r0, [r2] -; CHECK-DSP-IMM-NEXT: ldrh r3, [r3] -; CHECK-DSP-IMM-NEXT: usub16 r0, r0, r1 -; CHECK-DSP-IMM-NEXT: uadd16 r1, r3, r1 -; CHECK-DSP-IMM-NEXT: cmp r0, r1 -; CHECK-DSP-IMM-NEXT: it lo -; CHECK-DSP-IMM-NEXT: movlo r0, r1 -; CHECK-DSP-IMM-NEXT: bx lr 
-entry: - %add0 = add i32 %arg0, %arg1 - %conv0 = trunc i32 %add0 to i16 - %sub0 = sub i16 0, %conv0 - %load0 = load i16, i16* %gep0, align 2 - %load1 = load i16, i16* %gep1, align 2 - %sub1 = sub i16 %load0, %sub0 - %add1 = add i16 %load1, %sub0 - %cmp = icmp ult i16 %sub1, %add1 - %res = select i1 %cmp, i16 %add1, i16 %sub1 - ret i16 %res -} - -define i8 @trunc_i16_i8(i16* %ptr, i16 zeroext %arg0, i8 zeroext %arg1) { -; CHECK-LABEL: trunc_i16_i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: add r0, r1 -; CHECK-NEXT: uxtb r0, r0 -; CHECK-NEXT: cmp r0, r2 -; CHECK-NEXT: it ls -; CHECK-NEXT: movls r0, r2 -; CHECK-NEXT: bx lr -entry: - %0 = load i16, i16* %ptr - %1 = add i16 %0, %arg0 - %2 = trunc i16 %1 to i8 - %3 = icmp ugt i8 %2, %arg1 - %4 = select i1 %3, i8 %2, i8 %arg1 - ret i8 %4 -} - -; The pass performs the transform, but a uxtb will still be inserted to handle -; the zext to the icmp. -define i8 @icmp_i32_zext(i8* %ptr) { -; CHECK-NODSP-V8-LABEL: icmp_i32_zext: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: ldrb r2, [r0], #1 -; CHECK-NODSP-V8-NEXT: subs r1, r2, #1 -; CHECK-NODSP-V8-NEXT: .p2align 2 -; CHECK-NODSP-V8-NEXT: .LBB2_1: @ %body -; CHECK-NODSP-V8-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NODSP-V8-NEXT: uxtb r3, r1 -; CHECK-NODSP-V8-NEXT: cmp r2, r3 -; CHECK-NODSP-V8-NEXT: itt ne -; CHECK-NODSP-V8-NEXT: movne r0, r1 -; CHECK-NODSP-V8-NEXT: bxne lr -; CHECK-NODSP-V8-NEXT: ldrb r1, [r0, r2] -; CHECK-NODSP-V8-NEXT: adds r2, #1 -; CHECK-NODSP-V8-NEXT: b .LBB2_1 -; -; CHECK-NODSP-V7-LABEL: icmp_i32_zext: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: ldrb r2, [r0], #1 -; CHECK-NODSP-V7-NEXT: subs r1, r2, #1 -; CHECK-NODSP-V7-NEXT: .LBB2_1: @ %body -; CHECK-NODSP-V7-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NODSP-V7-NEXT: uxtb r3, r1 -; CHECK-NODSP-V7-NEXT: cmp r2, r3 -; CHECK-NODSP-V7-NEXT: itt ne -; CHECK-NODSP-V7-NEXT: movne r0, r1 -; CHECK-NODSP-V7-NEXT: bxne lr -; CHECK-NODSP-V7-NEXT: ldrb r1, [r0, r2] -; CHECK-NODSP-V7-NEXT: adds r2, #1 -; CHECK-NODSP-V7-NEXT: b .LBB2_1 -; -; CHECK-DSP-LABEL: icmp_i32_zext: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: ldrb r2, [r0], #1 -; CHECK-DSP-NEXT: subs r1, r2, #1 -; CHECK-DSP-NEXT: .LBB2_1: @ %body -; CHECK-DSP-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-DSP-NEXT: uxtb r3, r1 -; CHECK-DSP-NEXT: cmp r2, r3 -; CHECK-DSP-NEXT: itt ne -; CHECK-DSP-NEXT: movne r0, r1 -; CHECK-DSP-NEXT: bxne lr -; CHECK-DSP-NEXT: ldrb r1, [r0, r2] -; CHECK-DSP-NEXT: adds r2, #1 -; CHECK-DSP-NEXT: b .LBB2_1 -; -; CHECK-DSP-IMM-LABEL: icmp_i32_zext: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: ldrb r2, [r0], #1 -; CHECK-DSP-IMM-NEXT: subs r1, r2, #1 -; CHECK-DSP-IMM-NEXT: .LBB2_1: @ %body -; CHECK-DSP-IMM-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-DSP-IMM-NEXT: uxtb r3, r1 -; CHECK-DSP-IMM-NEXT: cmp r2, r3 -; CHECK-DSP-IMM-NEXT: bne .LBB2_3 -; CHECK-DSP-IMM-NEXT: @ %bb.2: @ %if.end -; CHECK-DSP-IMM-NEXT: @ in Loop: Header=BB2_1 Depth=1 -; CHECK-DSP-IMM-NEXT: ldrb r1, [r0, r2] -; CHECK-DSP-IMM-NEXT: adds r2, #1 -; CHECK-DSP-IMM-NEXT: b .LBB2_1 -; CHECK-DSP-IMM-NEXT: .LBB2_3: @ %exit -; CHECK-DSP-IMM-NEXT: mov r0, r1 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %gep = getelementptr inbounds i8, i8* %ptr, i32 0 - %0 = load i8, i8* %gep, align 1 - %1 = sub nuw nsw i8 %0, 1 - %conv44 = zext i8 %0 to i32 - br label %preheader - -preheader: - br label %body - -body: - %2 = phi i8 [ %1, %preheader ], [ %3, %if.end ] - %si.0274 = phi i32 [ %conv44, %preheader ], [
%inc, %if.end ] - %conv51266 = zext i8 %2 to i32 - %cmp52267 = icmp eq i32 %si.0274, %conv51266 - br i1 %cmp52267, label %if.end, label %exit - -if.end: - %inc = add i32 %si.0274, 1 - %gep1 = getelementptr inbounds i8, i8* %ptr, i32 %inc - %3 = load i8, i8* %gep1, align 1 - br label %body - -exit: - ret i8 %2 -} - -; We don't handle sext -define i32 @icmp_sext_zext_store_i8_i16() { -; CHECK-NODSP-V8-LABEL: icmp_sext_zext_store_i8_i16: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: movw r0, :lower16:d_uch -; CHECK-NODSP-V8-NEXT: movt r0, :upper16:d_uch -; CHECK-NODSP-V8-NEXT: ldrb r1, [r0, #2] -; CHECK-NODSP-V8-NEXT: movw r0, :lower16:d_sh -; CHECK-NODSP-V8-NEXT: movt r0, :upper16:d_sh -; CHECK-NODSP-V8-NEXT: ldrsh.w r0, [r0, #4] -; CHECK-NODSP-V8-NEXT: movw r2, :lower16:sh1 -; CHECK-NODSP-V8-NEXT: subs r0, r1, r0 -; CHECK-NODSP-V8-NEXT: clz r0, r0 -; CHECK-NODSP-V8-NEXT: movt r2, :upper16:sh1 -; CHECK-NODSP-V8-NEXT: lsrs r0, r0, #5 -; CHECK-NODSP-V8-NEXT: strh r1, [r2] -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: icmp_sext_zext_store_i8_i16: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: movw r0, :lower16:d_sh -; CHECK-NODSP-V7-NEXT: movw r1, :lower16:d_uch -; CHECK-NODSP-V7-NEXT: movt r0, :upper16:d_sh -; CHECK-NODSP-V7-NEXT: movt r1, :upper16:d_uch -; CHECK-NODSP-V7-NEXT: ldrb r1, [r1, #2] -; CHECK-NODSP-V7-NEXT: movw r2, :lower16:sh1 -; CHECK-NODSP-V7-NEXT: ldrsh.w r0, [r0, #4] -; CHECK-NODSP-V7-NEXT: movt r2, :upper16:sh1 -; CHECK-NODSP-V7-NEXT: strh r1, [r2] -; CHECK-NODSP-V7-NEXT: subs r0, r1, r0 -; CHECK-NODSP-V7-NEXT: clz r0, r0 -; CHECK-NODSP-V7-NEXT: lsrs r0, r0, #5 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: icmp_sext_zext_store_i8_i16: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: movw r0, :lower16:d_uch -; CHECK-DSP-NEXT: movw r1, :lower16:sh1 -; CHECK-DSP-NEXT: movt r0, :upper16:d_uch -; CHECK-DSP-NEXT: movt r1, :upper16:sh1 -; CHECK-DSP-NEXT: ldrb r0, [r0, #2] -; CHECK-DSP-NEXT: strh r0, [r1] -; CHECK-DSP-NEXT: movw r1, :lower16:d_sh -; CHECK-DSP-NEXT: movt r1, :upper16:d_sh -; CHECK-DSP-NEXT: ldrsh.w r1, [r1, #4] -; CHECK-DSP-NEXT: subs r0, r0, r1 -; CHECK-DSP-NEXT: clz r0, r0 -; CHECK-DSP-NEXT: lsrs r0, r0, #5 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: icmp_sext_zext_store_i8_i16: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: movw r0, :lower16:d_sh -; CHECK-DSP-IMM-NEXT: movw r1, :lower16:d_uch -; CHECK-DSP-IMM-NEXT: movt r0, :upper16:d_sh -; CHECK-DSP-IMM-NEXT: movt r1, :upper16:d_uch -; CHECK-DSP-IMM-NEXT: ldrb r1, [r1, #2] -; CHECK-DSP-IMM-NEXT: movw r2, :lower16:sh1 -; CHECK-DSP-IMM-NEXT: ldrsh.w r0, [r0, #4] -; CHECK-DSP-IMM-NEXT: movt r2, :upper16:sh1 -; CHECK-DSP-IMM-NEXT: strh r1, [r2] -; CHECK-DSP-IMM-NEXT: subs r0, r1, r0 -; CHECK-DSP-IMM-NEXT: clz r0, r0 -; CHECK-DSP-IMM-NEXT: lsrs r0, r0, #5 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %0 = load i8, i8* getelementptr inbounds ([16 x i8], [16 x i8]* @d_uch, i32 0, i32 2), align 1 - %conv = zext i8 %0 to i16 - store i16 %conv, i16* @sh1, align 2 - %conv1 = zext i8 %0 to i32 - %1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @d_sh, i32 0, i32 2), align 2 - %conv2 = sext i16 %1 to i32 - %cmp = icmp eq i32 %conv1, %conv2 - %conv3 = zext i1 %cmp to i32 - ret i32 %conv3 -} - -define i1 @or_icmp_ugt(i32 %arg, i8* %ptr) { -; CHECK-NODSP-V8-LABEL: or_icmp_ugt: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: ldrb r1, [r1] -; CHECK-NODSP-V8-NEXT: adds r2, r1, #3 -; CHECK-NODSP-V8-NEXT: subs.w r0, r0, r2, lsl #1 -;
CHECK-NODSP-V8-NEXT: it ne -; CHECK-NODSP-V8-NEXT: movne r0, #1 -; CHECK-NODSP-V8-NEXT: subs r1, #1 -; CHECK-NODSP-V8-NEXT: movs r2, #0 -; CHECK-NODSP-V8-NEXT: cmp r1, #3 -; CHECK-NODSP-V8-NEXT: it hi -; CHECK-NODSP-V8-NEXT: movhi r2, #1 -; CHECK-NODSP-V8-NEXT: orrs r0, r2 -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: or_icmp_ugt: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: ldrb r1, [r1] -; CHECK-NODSP-V7-NEXT: adds r2, r1, #3 -; CHECK-NODSP-V7-NEXT: subs r1, #1 -; CHECK-NODSP-V7-NEXT: subs.w r0, r0, r2, lsl #1 -; CHECK-NODSP-V7-NEXT: mov.w r2, #0 -; CHECK-NODSP-V7-NEXT: it ne -; CHECK-NODSP-V7-NEXT: movne r0, #1 -; CHECK-NODSP-V7-NEXT: cmp r1, #3 -; CHECK-NODSP-V7-NEXT: it hi -; CHECK-NODSP-V7-NEXT: movhi r2, #1 -; CHECK-NODSP-V7-NEXT: orrs r0, r2 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: or_icmp_ugt: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: ldrb r1, [r1] -; CHECK-DSP-NEXT: adds r2, r1, #3 -; CHECK-DSP-NEXT: subs r1, #1 -; CHECK-DSP-NEXT: subs.w r0, r0, r2, lsl #1 -; CHECK-DSP-NEXT: mov.w r2, #0 -; CHECK-DSP-NEXT: it ne -; CHECK-DSP-NEXT: movne r0, #1 -; CHECK-DSP-NEXT: cmp r1, #3 -; CHECK-DSP-NEXT: it hi -; CHECK-DSP-NEXT: movhi r2, #1 -; CHECK-DSP-NEXT: orrs r0, r2 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: or_icmp_ugt: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: ldrb r1, [r1] -; CHECK-DSP-IMM-NEXT: adds r2, r1, #3 -; CHECK-DSP-IMM-NEXT: subs.w r0, r0, r2, lsl #1 -; CHECK-DSP-IMM-NEXT: it ne -; CHECK-DSP-IMM-NEXT: movne r0, #1 -; CHECK-DSP-IMM-NEXT: subs r1, #1 -; CHECK-DSP-IMM-NEXT: movs r2, #0 -; CHECK-DSP-IMM-NEXT: cmp r1, #3 -; CHECK-DSP-IMM-NEXT: it hi -; CHECK-DSP-IMM-NEXT: movhi r2, #1 -; CHECK-DSP-IMM-NEXT: orrs r0, r2 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %0 = load i8, i8* %ptr - %1 = zext i8 %0 to i32 - %mul = shl nuw nsw i32 %1, 1 - %add0 = add nuw nsw i32 %mul, 6 - %cmp0 = icmp ne i32 %arg, %add0 - %add1 = add i8 %0, -1 - %cmp1 = icmp ugt i8 %add1, 3 - %or = or i1 %cmp0, %cmp1 - ret i1 %or -} - -; We currently only handle truncs as sinks, so a uxt will still be needed for -; the icmp ugt instruction. 
-define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) { -; CHECK-NODSP-V8-LABEL: urem_trunc_icmps: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: ldr r0, [r0] -; CHECK-NODSP-V8-NEXT: ldrh r0, [r0] -; CHECK-NODSP-V8-NEXT: cbz r0, .LBB5_3 -; CHECK-NODSP-V8-NEXT: @ %bb.1: @ %cond.false.i -; CHECK-NODSP-V8-NEXT: movs r3, #5 -; CHECK-NODSP-V8-NEXT: udiv r3, r3, r0 -; CHECK-NODSP-V8-NEXT: muls r0, r3, r0 -; CHECK-NODSP-V8-NEXT: rsb.w r0, r0, #5 -; CHECK-NODSP-V8-NEXT: .p2align 2 -; CHECK-NODSP-V8-NEXT: .LBB5_2: @ %body -; CHECK-NODSP-V8-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NODSP-V8-NEXT: uxtb r3, r0 -; CHECK-NODSP-V8-NEXT: cmp r3, #7 -; CHECK-NODSP-V8-NEXT: mov.w r3, #0 -; CHECK-NODSP-V8-NEXT: it hi -; CHECK-NODSP-V8-NEXT: movhi r3, #1 -; CHECK-NODSP-V8-NEXT: str r3, [r1] -; CHECK-NODSP-V8-NEXT: ldr r3, [r2] -; CHECK-NODSP-V8-NEXT: cmp r3, #0 -; CHECK-NODSP-V8-NEXT: it ne -; CHECK-NODSP-V8-NEXT: bxne lr -; CHECK-NODSP-V8-NEXT: adds r0, #1 -; CHECK-NODSP-V8-NEXT: b .LBB5_2 -; CHECK-NODSP-V8-NEXT: .LBB5_3: @ %exit -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: urem_trunc_icmps: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: .save {r4, r5, r7, lr} -; CHECK-NODSP-V7-NEXT: push {r4, r5, r7, lr} -; CHECK-NODSP-V7-NEXT: ldr r0, [r0] -; CHECK-NODSP-V7-NEXT: mov r5, r1 -; CHECK-NODSP-V7-NEXT: ldrh r1, [r0] -; CHECK-NODSP-V7-NEXT: cbz r1, .LBB5_4 -; CHECK-NODSP-V7-NEXT: @ %bb.1: @ %cond.false.i -; CHECK-NODSP-V7-NEXT: movs r0, #5 -; CHECK-NODSP-V7-NEXT: mov r4, r2 -; CHECK-NODSP-V7-NEXT: bl __aeabi_uidivmod -; CHECK-NODSP-V7-NEXT: .LBB5_2: @ %body -; CHECK-NODSP-V7-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NODSP-V7-NEXT: uxtb r0, r1 -; CHECK-NODSP-V7-NEXT: cmp r0, #7 -; CHECK-NODSP-V7-NEXT: mov.w r0, #0 -; CHECK-NODSP-V7-NEXT: it hi -; CHECK-NODSP-V7-NEXT: movhi r0, #1 -; CHECK-NODSP-V7-NEXT: str r0, [r5] -; CHECK-NODSP-V7-NEXT: ldr r0, [r4] -; CHECK-NODSP-V7-NEXT: cbnz r0, .LBB5_4 -; CHECK-NODSP-V7-NEXT: @ %bb.3: @ %for.inc -; CHECK-NODSP-V7-NEXT: @ in Loop: Header=BB5_2 Depth=1 -; CHECK-NODSP-V7-NEXT: adds r1, #1 -; CHECK-NODSP-V7-NEXT: b .LBB5_2 -; CHECK-NODSP-V7-NEXT: .LBB5_4: @ %exit -; CHECK-NODSP-V7-NEXT: pop {r4, r5, r7, pc} -; -; CHECK-DSP-LABEL: urem_trunc_icmps: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: ldr r0, [r0] -; CHECK-DSP-NEXT: ldrh r0, [r0] -; CHECK-DSP-NEXT: cbz r0, .LBB5_3 -; CHECK-DSP-NEXT: @ %bb.1: @ %cond.false.i -; CHECK-DSP-NEXT: movs r3, #5 -; CHECK-DSP-NEXT: udiv r3, r3, r0 -; CHECK-DSP-NEXT: muls r0, r3, r0 -; CHECK-DSP-NEXT: rsb.w r0, r0, #5 -; CHECK-DSP-NEXT: .LBB5_2: @ %body -; CHECK-DSP-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-DSP-NEXT: uxtb r3, r0 -; CHECK-DSP-NEXT: cmp r3, #7 -; CHECK-DSP-NEXT: mov.w r3, #0 -; CHECK-DSP-NEXT: it hi -; CHECK-DSP-NEXT: movhi r3, #1 -; CHECK-DSP-NEXT: str r3, [r1] -; CHECK-DSP-NEXT: ldr r3, [r2] -; CHECK-DSP-NEXT: cmp r3, #0 -; CHECK-DSP-NEXT: it ne -; CHECK-DSP-NEXT: bxne lr -; CHECK-DSP-NEXT: adds r0, #1 -; CHECK-DSP-NEXT: b .LBB5_2 -; CHECK-DSP-NEXT: .LBB5_3: @ %exit -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: urem_trunc_icmps: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: ldr r0, [r0] -; CHECK-DSP-IMM-NEXT: ldrh r0, [r0] -; CHECK-DSP-IMM-NEXT: cbz r0, .LBB5_4 -; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %cond.false.i -; CHECK-DSP-IMM-NEXT: movs r3, #5 -; CHECK-DSP-IMM-NEXT: udiv r3, r3, r0 -; CHECK-DSP-IMM-NEXT: muls r0, r3, r0 -; CHECK-DSP-IMM-NEXT: rsb.w r0, r0, #5 -; CHECK-DSP-IMM-NEXT: .LBB5_2: @ %body -; CHECK-DSP-IMM-NEXT: @ =>This 
Inner Loop Header: Depth=1 -; CHECK-DSP-IMM-NEXT: uxtb r3, r0 -; CHECK-DSP-IMM-NEXT: cmp r3, #7 -; CHECK-DSP-IMM-NEXT: mov.w r3, #0 -; CHECK-DSP-IMM-NEXT: it hi -; CHECK-DSP-IMM-NEXT: movhi r3, #1 -; CHECK-DSP-IMM-NEXT: str r3, [r1] -; CHECK-DSP-IMM-NEXT: ldr r3, [r2] -; CHECK-DSP-IMM-NEXT: cbnz r3, .LBB5_4 -; CHECK-DSP-IMM-NEXT: @ %bb.3: @ %for.inc -; CHECK-DSP-IMM-NEXT: @ in Loop: Header=BB5_2 Depth=1 -; CHECK-DSP-IMM-NEXT: adds r0, #1 -; CHECK-DSP-IMM-NEXT: b .LBB5_2 -; CHECK-DSP-IMM-NEXT: .LBB5_4: @ %exit -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %ptr = load i16*, i16** %in, align 4 - %ld = load i16, i16* %ptr, align 2 - %cmp.i = icmp eq i16 %ld, 0 - br i1 %cmp.i, label %exit, label %cond.false.i - -cond.false.i: - %rem = urem i16 5, %ld - %extract.t = trunc i16 %rem to i8 - br label %body - -body: - %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ] - %cmp = icmp ugt i8 %cond.in.i.off0, 7 - %conv5 = zext i1 %cmp to i32 - store i32 %conv5, i32* %g, align 4 - %.pr = load i32, i32* %k, align 4 - %tobool13150 = icmp eq i32 %.pr, 0 - br i1 %tobool13150, label %for.inc, label %exit - -for.inc: - %add = add nuw i8 %cond.in.i.off0, 1 - br label %body - -exit: - ret void -} - -; Check that %exp requires uxth in all cases, and will also be required to -; promote %1 for the call - unless we can generate a uadd16. -define i32 @zext_load_sink_call(i16* %ptr, i16 %exp) { -; CHECK-NODSP-LABEL: zext_load_sink_call: -; CHECK-NODSP: @ %bb.0: @ %entry -; CHECK-NODSP-NEXT: ldrh r0, [r0] -; CHECK-NODSP-NEXT: uxth r2, r1 -; CHECK-NODSP-NEXT: cmp r0, r2 -; CHECK-NODSP-NEXT: itt eq -; CHECK-NODSP-NEXT: moveq r0, #0 -; CHECK-NODSP-NEXT: bxeq lr -; CHECK-NODSP-NEXT: adds r1, #3 -; CHECK-NODSP-NEXT: uxth r1, r1 -; CHECK-NODSP-NEXT: b dummy -; -; CHECK-DSP-LABEL: zext_load_sink_call: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: ldrh r0, [r0] -; CHECK-DSP-NEXT: uxth r2, r1 -; CHECK-DSP-NEXT: cmp r0, r2 -; CHECK-DSP-NEXT: itt eq -; CHECK-DSP-NEXT: moveq r0, #0 -; CHECK-DSP-NEXT: bxeq lr -; CHECK-DSP-NEXT: adds r1, #3 -; CHECK-DSP-NEXT: uxth r1, r1 -; CHECK-DSP-NEXT: b dummy -; -; CHECK-DSP-IMM-LABEL: zext_load_sink_call: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: uxth r2, r1 -; CHECK-DSP-IMM-NEXT: ldrh r0, [r0] -; CHECK-DSP-IMM-NEXT: movs r1, #3 -; CHECK-DSP-IMM-NEXT: uadd16 r1, r2, r1 -; CHECK-DSP-IMM-NEXT: cmp r0, r2 -; CHECK-DSP-IMM-NEXT: bne .LBB6_2 -; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %exit -; CHECK-DSP-IMM-NEXT: movs r0, #0 -; CHECK-DSP-IMM-NEXT: bx lr -; CHECK-DSP-IMM-NEXT: .LBB6_2: @ %if.then -; CHECK-DSP-IMM-NEXT: b dummy -entry: - %0 = load i16, i16* %ptr, align 4 - %1 = add i16 %exp, 3 - %cmp = icmp eq i16 %0, %exp - br i1 %cmp, label %exit, label %if.then - -if.then: - %conv0 = zext i16 %0 to i32 - %conv1 = zext i16 %1 to i32 - %call = tail call arm_aapcs_vfpcc i32 @dummy(i32 %conv0, i32 %conv1) - br label %exit - -exit: - %exitval = phi i32 [ %call, %if.then ], [ 0, %entry ] - ret i32 %exitval -} - -define i16 @bitcast_i16(i16 zeroext %arg0, i16 zeroext %arg1) { -; CHECK-NODSP-LABEL: bitcast_i16: -; CHECK-NODSP: @ %bb.0: @ %entry -; CHECK-NODSP-NEXT: adds r0, #1 -; CHECK-NODSP-NEXT: movw r2, #12345 -; CHECK-NODSP-NEXT: cmp r0, r2 -; CHECK-NODSP-NEXT: it hi -; CHECK-NODSP-NEXT: movwhi r1, #32657 -; CHECK-NODSP-NEXT: mov r0, r1 -; CHECK-NODSP-NEXT: bx lr -; -; CHECK-DSP-LABEL: bitcast_i16: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: adds r0, #1 -; CHECK-DSP-NEXT: movw r2, #12345 -; CHECK-DSP-NEXT: cmp r0, r2 -; CHECK-DSP-NEXT: it hi -; CHECK-DSP-NEXT: 
movwhi r1, #32657 -; CHECK-DSP-NEXT: mov r0, r1 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: bitcast_i16: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: adds r2, r0, #1 -; CHECK-DSP-IMM-NEXT: movw r0, #32657 -; CHECK-DSP-IMM-NEXT: movw r3, #12345 -; CHECK-DSP-IMM-NEXT: cmp r2, r3 -; CHECK-DSP-IMM-NEXT: it ls -; CHECK-DSP-IMM-NEXT: movls r0, r1 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %cast = bitcast i16 12345 to i16 - %add = add nuw i16 %arg0, 1 - %cmp = icmp ule i16 %add, %cast - %res = select i1 %cmp, i16 %arg1, i16 32657 - ret i16 %res -} - -define i8 @bitcast_i8(i8 zeroext %arg0, i8 zeroext %arg1) { -; CHECK-LABEL: bitcast_i8: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: mvn r2, #127 -; CHECK-NEXT: cmp.w r1, r0, lsl #1 -; CHECK-NEXT: it ls -; CHECK-NEXT: movls r2, #127 -; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: bx lr -entry: - %cast = bitcast i8 127 to i8 - %mul = shl nuw i8 %arg0, 1 - %cmp = icmp uge i8 %mul, %arg1 - %res = select i1 %cmp, i8 %cast, i8 128 - ret i8 %res -} - -define i16 @bitcast_i16_minus(i16 zeroext %arg0, i16 zeroext %arg1) { -; CHECK-NODSP-LABEL: bitcast_i16_minus: -; CHECK-NODSP: @ %bb.0: @ %entry -; CHECK-NODSP-NEXT: eor r2, r0, #7 -; CHECK-NODSP-NEXT: movw r0, #32657 -; CHECK-NODSP-NEXT: cmp r2, r1 -; CHECK-NODSP-NEXT: itt eq -; CHECK-NODSP-NEXT: movweq r0, #53191 -; CHECK-NODSP-NEXT: movteq r0, #65535 -; CHECK-NODSP-NEXT: bx lr -; -; CHECK-DSP-LABEL: bitcast_i16_minus: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: eor r2, r0, #7 -; CHECK-DSP-NEXT: movw r0, #32657 -; CHECK-DSP-NEXT: cmp r2, r1 -; CHECK-DSP-NEXT: itt eq -; CHECK-DSP-NEXT: movweq r0, #53191 -; CHECK-DSP-NEXT: movteq r0, #65535 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: bitcast_i16_minus: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: eor r2, r0, #7 -; CHECK-DSP-IMM-NEXT: movw r0, #32657 -; CHECK-DSP-IMM-NEXT: cmp r2, r1 -; CHECK-DSP-IMM-NEXT: it eq -; CHECK-DSP-IMM-NEXT: movweq r0, #53191 -; CHECK-DSP-IMM-NEXT: it eq -; CHECK-DSP-IMM-NEXT: movteq r0, #65535 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %cast = bitcast i16 -12345 to i16 - %xor = xor i16 %arg0, 7 - %cmp = icmp eq i16 %xor, %arg1 - %res = select i1 %cmp, i16 %cast, i16 32657 - ret i16 %res -} - -define i8 @bitcast_i8_minus(i8 zeroext %arg0, i8 zeroext %arg1) { -; CHECK-LABEL: bitcast_i8_minus: -; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: and r2, r0, #3 -; CHECK-NEXT: mvn r0, #127 -; CHECK-NEXT: cmp r2, r1 -; CHECK-NEXT: it ne -; CHECK-NEXT: mvnne r0, #126 -; CHECK-NEXT: bx lr -entry: - %cast = bitcast i8 -127 to i8 - %and = and i8 %arg0, 3 - %cmp = icmp ne i8 %and, %arg1 - %res = select i1 %cmp, i8 %cast, i8 128 - ret i8 %res -} - -declare i32 @dummy(i32, i32) - -@d_uch = hidden local_unnamed_addr global [16 x i8] zeroinitializer, align 1 -@sh1 = hidden local_unnamed_addr global i16 0, align 2 -@d_sh = hidden local_unnamed_addr global [16 x i16] zeroinitializer, align 2 - -define i8* @two_stage_zext_trunc_mix(i32* %this, i32 %__pos1, i32 %__n1, i32** %__str, i32 %__pos2, i32 %__n2) { -; CHECK-NODSP-V8-LABEL: two_stage_zext_trunc_mix: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: ldr.w r12, [sp] -; CHECK-NODSP-V8-NEXT: ldrb r2, [r0] -; CHECK-NODSP-V8-NEXT: add.w r0, r3, r12 -; CHECK-NODSP-V8-NEXT: lsls r2, r2, #31 -; CHECK-NODSP-V8-NEXT: it eq -; CHECK-NODSP-V8-NEXT: addeq r0, r3, r1 -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: two_stage_zext_trunc_mix: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: ldrb r2, [r0] -; CHECK-NODSP-V7-NEXT: ldr.w r12, [sp] -; 
CHECK-NODSP-V7-NEXT: add.w r0, r3, r12 -; CHECK-NODSP-V7-NEXT: lsls r2, r2, #31 -; CHECK-NODSP-V7-NEXT: it eq -; CHECK-NODSP-V7-NEXT: addeq r0, r3, r1 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: two_stage_zext_trunc_mix: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: ldr r2, [sp] -; CHECK-DSP-NEXT: ldrb r0, [r0] -; CHECK-DSP-NEXT: add r2, r3 -; CHECK-DSP-NEXT: lsls r0, r0, #31 -; CHECK-DSP-NEXT: it eq -; CHECK-DSP-NEXT: addeq r2, r3, r1 -; CHECK-DSP-NEXT: mov r0, r2 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: two_stage_zext_trunc_mix: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: ldr.w r12, [sp] -; CHECK-DSP-IMM-NEXT: ldrb r2, [r0] -; CHECK-DSP-IMM-NEXT: adds r0, r3, r1 -; CHECK-DSP-IMM-NEXT: add r12, r3 -; CHECK-DSP-IMM-NEXT: lsls r1, r2, #31 -; CHECK-DSP-IMM-NEXT: it ne -; CHECK-DSP-IMM-NEXT: movne r0, r12 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %__size_.i.i.i.i = bitcast i32** %__str to i8* - %0 = load i8, i8* %__size_.i.i.i.i, align 4 - %1 = and i8 %0, 1 - %tobool.i.i.i.i = icmp eq i8 %1, 0 - %__size_.i5.i.i = getelementptr inbounds i32*, i32** %__str, i32 %__n1 - %cast = bitcast i32** %__size_.i5.i.i to i32* - %2 = load i32, i32* %cast, align 4 - %3 = lshr i8 %0, 1 - %4 = zext i8 %3 to i32 - %cond.i.i = select i1 %tobool.i.i.i.i, i32 %4, i32 %2 - %__size_.i.i.i.i.i = bitcast i32* %this to i8* - %5 = load i8, i8* %__size_.i.i.i.i.i, align 4 - %6 = and i8 %5, 1 - %tobool.i.i.i.i.i = icmp eq i8 %6, 0 - %7 = getelementptr inbounds i8, i8* %__size_.i.i.i.i, i32 %__pos1 - %8 = getelementptr inbounds i8, i8* %__size_.i.i.i.i, i32 %__pos2 - %res = select i1 %tobool.i.i.i.i.i, i8* %7, i8* %8 - ret i8* %res -} - -define i8 @search_through_zext_1(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) { -; CHECK-NODSP-V8-LABEL: search_through_zext_1: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: subs r3, r1, r0 -; CHECK-NODSP-V8-NEXT: add.w r12, r0, r1 -; CHECK-NODSP-V8-NEXT: cmp r3, r2 -; CHECK-NODSP-V8-NEXT: it ls -; CHECK-NODSP-V8-NEXT: movls r0, r1 -; CHECK-NODSP-V8-NEXT: cmp r12, r2 -; CHECK-NODSP-V8-NEXT: it hs -; CHECK-NODSP-V8-NEXT: movhs r0, #0 -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: search_through_zext_1: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: subs r3, r1, r0 -; CHECK-NODSP-V7-NEXT: cmp r3, r2 -; CHECK-NODSP-V7-NEXT: mov r3, r1 -; CHECK-NODSP-V7-NEXT: it hi -; CHECK-NODSP-V7-NEXT: movhi r3, r0 -; CHECK-NODSP-V7-NEXT: add r0, r1 -; CHECK-NODSP-V7-NEXT: cmp r0, r2 -; CHECK-NODSP-V7-NEXT: it hs -; CHECK-NODSP-V7-NEXT: movhs r3, #0 -; CHECK-NODSP-V7-NEXT: mov r0, r3 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: search_through_zext_1: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: subs r3, r1, r0 -; CHECK-DSP-NEXT: cmp r3, r2 -; CHECK-DSP-NEXT: mov r3, r1 -; CHECK-DSP-NEXT: it hi -; CHECK-DSP-NEXT: movhi r3, r0 -; CHECK-DSP-NEXT: add r0, r1 -; CHECK-DSP-NEXT: cmp r0, r2 -; CHECK-DSP-NEXT: it hs -; CHECK-DSP-NEXT: movhs r3, #0 -; CHECK-DSP-NEXT: mov r0, r3 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: search_through_zext_1: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: subs r3, r1, r0 -; CHECK-DSP-IMM-NEXT: cmp r3, r2 -; CHECK-DSP-IMM-NEXT: mov r3, r1 -; CHECK-DSP-IMM-NEXT: it hi -; CHECK-DSP-IMM-NEXT: movhi r3, r0 -; CHECK-DSP-IMM-NEXT: add r1, r0 -; CHECK-DSP-IMM-NEXT: movs r0, #0 -; CHECK-DSP-IMM-NEXT: cmp r1, r2 -; CHECK-DSP-IMM-NEXT: it lo -; CHECK-DSP-IMM-NEXT: movlo r0, r3 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %add = add nuw i8 %a, %b - %conv = zext i8 %add to i16 - %cmp = icmp 
ult i16 %conv, %c - br i1 %cmp, label %if.then, label %if.end - -if.then: - %sub = sub nuw i8 %b, %a - %conv2 = zext i8 %sub to i16 - %cmp2 = icmp ugt i16 %conv2, %c - %res = select i1 %cmp2, i8 %a, i8 %b - br label %if.end - -if.end: - %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] - ret i8 %retval -} - -; TODO: We should be able to remove the uxtb here. The transform fails because -; the icmp ugt uses an i32, which is too large... but this doesn't matter -; because it won't be writing a large value to a register as a result. -define i8 @search_through_zext_2(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) { -; CHECK-NODSP-V8-LABEL: search_through_zext_2: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: push {r7, lr} -; CHECK-NODSP-V8-NEXT: sub.w lr, r1, r0 -; CHECK-NODSP-V8-NEXT: add.w r12, r0, r1 -; CHECK-NODSP-V8-NEXT: uxtb.w lr, lr -; CHECK-NODSP-V8-NEXT: uxtb.w r12, r12 -; CHECK-NODSP-V8-NEXT: cmp lr, r3 -; CHECK-NODSP-V8-NEXT: it ls -; CHECK-NODSP-V8-NEXT: movls r0, r1 -; CHECK-NODSP-V8-NEXT: cmp r12, r2 -; CHECK-NODSP-V8-NEXT: it hs -; CHECK-NODSP-V8-NEXT: movhs r0, #0 -; CHECK-NODSP-V8-NEXT: pop {r7, pc} -; -; CHECK-NODSP-V7-LABEL: search_through_zext_2: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: sub.w r12, r1, r0 -; CHECK-NODSP-V7-NEXT: uxtb.w r12, r12 -; CHECK-NODSP-V7-NEXT: cmp r12, r3 -; CHECK-NODSP-V7-NEXT: mov r3, r1 -; CHECK-NODSP-V7-NEXT: it hi -; CHECK-NODSP-V7-NEXT: movhi r3, r0 -; CHECK-NODSP-V7-NEXT: add r0, r1 -; CHECK-NODSP-V7-NEXT: uxtb r0, r0 -; CHECK-NODSP-V7-NEXT: cmp r0, r2 -; CHECK-NODSP-V7-NEXT: it hs -; CHECK-NODSP-V7-NEXT: movhs r3, #0 -; CHECK-NODSP-V7-NEXT: mov r0, r3 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: search_through_zext_2: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: sub.w r12, r1, r0 -; CHECK-DSP-NEXT: uxtb.w r12, r12 -; CHECK-DSP-NEXT: cmp r12, r3 -; CHECK-DSP-NEXT: mov r3, r1 -; CHECK-DSP-NEXT: it hi -; CHECK-DSP-NEXT: movhi r3, r0 -; CHECK-DSP-NEXT: add r0, r1 -; CHECK-DSP-NEXT: uxtb r0, r0 -; CHECK-DSP-NEXT: cmp r0, r2 -; CHECK-DSP-NEXT: it hs -; CHECK-DSP-NEXT: movhs r3, #0 -; CHECK-DSP-NEXT: mov r0, r3 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: search_through_zext_2: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: sub.w r12, r1, r0 -; CHECK-DSP-IMM-NEXT: uxtb.w r12, r12 -; CHECK-DSP-IMM-NEXT: cmp r12, r3 -; CHECK-DSP-IMM-NEXT: mov r3, r1 -; CHECK-DSP-IMM-NEXT: it hi -; CHECK-DSP-IMM-NEXT: movhi r3, r0 -; CHECK-DSP-IMM-NEXT: add r0, r1 -; CHECK-DSP-IMM-NEXT: uxtb r1, r0 -; CHECK-DSP-IMM-NEXT: movs r0, #0 -; CHECK-DSP-IMM-NEXT: cmp r1, r2 -; CHECK-DSP-IMM-NEXT: it lo -; CHECK-DSP-IMM-NEXT: movlo r0, r3 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %add = add nuw i8 %a, %b - %conv = zext i8 %add to i16 - %cmp = icmp ult i16 %conv, %c - br i1 %cmp, label %if.then, label %if.end - -if.then: - %sub = sub nuw i8 %b, %a - %conv2 = zext i8 %sub to i32 - %cmp2 = icmp ugt i32 %conv2, %d - %res = select i1 %cmp2, i8 %a, i8 %b - br label %if.end - -if.end: - %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] - ret i8 %retval -} - -; TODO: We should be able to remove the uxtb here as all the calculations are -; performed on i8s. The promotion of i8 to i16 and then the later truncation -; results in the uxtb. 
-define i8 @search_through_zext_3(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c, i32 %d) { -; CHECK-NODSP-LABEL: search_through_zext_3: -; CHECK-NODSP: @ %bb.0: @ %entry -; CHECK-NODSP-NEXT: add.w r12, r0, r1 -; CHECK-NODSP-NEXT: uxtb.w r12, r12 -; CHECK-NODSP-NEXT: cmp r12, r2 -; CHECK-NODSP-NEXT: itt hs -; CHECK-NODSP-NEXT: movhs r0, #0 -; CHECK-NODSP-NEXT: bxhs lr -; CHECK-NODSP-NEXT: sub.w r2, r1, r12 -; CHECK-NODSP-NEXT: uxtb r2, r2 -; CHECK-NODSP-NEXT: cmp r2, r3 -; CHECK-NODSP-NEXT: it ls -; CHECK-NODSP-NEXT: movls r0, r1 -; CHECK-NODSP-NEXT: bx lr -; -; CHECK-DSP-LABEL: search_through_zext_3: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: add.w r12, r0, r1 -; CHECK-DSP-NEXT: uxtb.w r12, r12 -; CHECK-DSP-NEXT: cmp r12, r2 -; CHECK-DSP-NEXT: itt hs -; CHECK-DSP-NEXT: movhs r0, #0 -; CHECK-DSP-NEXT: bxhs lr -; CHECK-DSP-NEXT: sub.w r2, r1, r12 -; CHECK-DSP-NEXT: uxtb r2, r2 -; CHECK-DSP-NEXT: cmp r2, r3 -; CHECK-DSP-NEXT: it ls -; CHECK-DSP-NEXT: movls r0, r1 -; CHECK-DSP-NEXT: bx lr -; -; CHECK-DSP-IMM-LABEL: search_through_zext_3: -; CHECK-DSP-IMM: @ %bb.0: @ %entry -; CHECK-DSP-IMM-NEXT: add.w r12, r0, r1 -; CHECK-DSP-IMM-NEXT: uxtb.w r12, r12 -; CHECK-DSP-IMM-NEXT: cmp r12, r2 -; CHECK-DSP-IMM-NEXT: bhs .LBB14_2 -; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %if.then -; CHECK-DSP-IMM-NEXT: sub.w r2, r1, r12 -; CHECK-DSP-IMM-NEXT: uxtb r2, r2 -; CHECK-DSP-IMM-NEXT: cmp r2, r3 -; CHECK-DSP-IMM-NEXT: it ls -; CHECK-DSP-IMM-NEXT: movls r0, r1 -; CHECK-DSP-IMM-NEXT: bx lr -; CHECK-DSP-IMM-NEXT: .LBB14_2: -; CHECK-DSP-IMM-NEXT: movs r0, #0 -; CHECK-DSP-IMM-NEXT: bx lr -entry: - %add = add nuw i8 %a, %b - %conv = zext i8 %add to i16 - %cmp = icmp ult i16 %conv, %c - br i1 %cmp, label %if.then, label %if.end - -if.then: - %trunc = trunc i16 %conv to i8 - %sub = sub nuw i8 %b, %trunc - %conv2 = zext i8 %sub to i32 - %cmp2 = icmp ugt i32 %conv2, %d - %res = select i1 %cmp2, i8 %a, i8 %b - br label %if.end - -if.end: - %retval = phi i8 [ 0, %entry ], [ %res, %if.then ] - ret i8 %retval -} - -; TODO: We should be able to remove the uxt that gets introduced for %conv2 -define i8 @search_through_zext_cmp(i8 zeroext %a, i8 zeroext %b, i16 zeroext %c) { -; CHECK-NODSP-V8-LABEL: search_through_zext_cmp: -; CHECK-NODSP-V8: @ %bb.0: @ %entry -; CHECK-NODSP-V8-NEXT: subs r3, r1, r0 -; CHECK-NODSP-V8-NEXT: subs.w r12, r1, r0 -; CHECK-NODSP-V8-NEXT: uxtb r3, r3 -; CHECK-NODSP-V8-NEXT: it ne -; CHECK-NODSP-V8-NEXT: movne.w r12, #1 -; CHECK-NODSP-V8-NEXT: cmp r3, r2 -; CHECK-NODSP-V8-NEXT: it ls -; CHECK-NODSP-V8-NEXT: movls r0, r1 -; CHECK-NODSP-V8-NEXT: cmp r12, r2 -; CHECK-NODSP-V8-NEXT: it hs -; CHECK-NODSP-V8-NEXT: movhs r0, #0 -; CHECK-NODSP-V8-NEXT: bx lr -; -; CHECK-NODSP-V7-LABEL: search_through_zext_cmp: -; CHECK-NODSP-V7: @ %bb.0: @ %entry -; CHECK-NODSP-V7-NEXT: subs r3, r1, r0 -; CHECK-NODSP-V7-NEXT: subs.w r12, r1, r0 -; CHECK-NODSP-V7-NEXT: it ne -; CHECK-NODSP-V7-NEXT: movne.w r12, #1 -; CHECK-NODSP-V7-NEXT: uxtb r3, r3 -; CHECK-NODSP-V7-NEXT: cmp r3, r2 -; CHECK-NODSP-V7-NEXT: it ls -; CHECK-NODSP-V7-NEXT: movls r0, r1 -; CHECK-NODSP-V7-NEXT: cmp r12, r2 -; CHECK-NODSP-V7-NEXT: it hs -; CHECK-NODSP-V7-NEXT: movhs r0, #0 -; CHECK-NODSP-V7-NEXT: bx lr -; -; CHECK-DSP-LABEL: search_through_zext_cmp: -; CHECK-DSP: @ %bb.0: @ %entry -; CHECK-DSP-NEXT: subs r3, r1, r0 -; CHECK-DSP-NEXT: subs.w r12, r1, r0 -; CHECK-DSP-NEXT: uxtb r3, r3 -; CHECK-DSP-NEXT: it ne -; CHECK-DSP-NEXT: movne.w r12, #1 -; CHECK-DSP-NEXT: cmp r3, r2 -; CHECK-DSP-NEXT: it ls -; CHECK-DSP-NEXT: movls r0, r1 -; CHECK-DSP-NEXT: cmp 
r12, r2
-; CHECK-DSP-NEXT: it hs
-; CHECK-DSP-NEXT: movhs r0, #0
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: search_through_zext_cmp:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: subs.w r12, r1, r0
-; CHECK-DSP-IMM-NEXT: it ne
-; CHECK-DSP-IMM-NEXT: movne.w r12, #1
-; CHECK-DSP-IMM-NEXT: subs r3, r1, r0
-; CHECK-DSP-IMM-NEXT: uxtb r3, r3
-; CHECK-DSP-IMM-NEXT: cmp r3, r2
-; CHECK-DSP-IMM-NEXT: it hi
-; CHECK-DSP-IMM-NEXT: movhi r1, r0
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: cmp r12, r2
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, r1
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %cmp = icmp ne i8 %a, %b
-  %conv = zext i1 %cmp to i16
-  %cmp1 = icmp ult i16 %conv, %c
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:
-  %sub = sub nuw i8 %b, %a
-  %conv2 = zext i8 %sub to i16
-  %cmp3 = icmp ugt i16 %conv2, %c
-  %res = select i1 %cmp3, i8 %a, i8 %b
-  br label %if.end
-
-if.end:
-  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
-  ret i8 %retval
-}
-
-define i8 @search_through_zext_load(i8* %a, i8 zeroext %b, i16 zeroext %c) {
-; CHECK-NODSP-V8-LABEL: search_through_zext_load:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: ldrb r3, [r0]
-; CHECK-NODSP-V8-NEXT: mov r0, r1
-; CHECK-NODSP-V8-NEXT: subs r1, r1, r3
-; CHECK-NODSP-V8-NEXT: cmp r1, r2
-; CHECK-NODSP-V8-NEXT: it hi
-; CHECK-NODSP-V8-NEXT: movhi r0, r3
-; CHECK-NODSP-V8-NEXT: cmp r3, r2
-; CHECK-NODSP-V8-NEXT: it hs
-; CHECK-NODSP-V8-NEXT: movhs r0, #0
-; CHECK-NODSP-V8-NEXT: bx lr
-;
-; CHECK-NODSP-V7-LABEL: search_through_zext_load:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V7-NEXT: subs r3, r1, r0
-; CHECK-NODSP-V7-NEXT: cmp r3, r2
-; CHECK-NODSP-V7-NEXT: it hi
-; CHECK-NODSP-V7-NEXT: movhi r1, r0
-; CHECK-NODSP-V7-NEXT: cmp r0, r2
-; CHECK-NODSP-V7-NEXT: it hs
-; CHECK-NODSP-V7-NEXT: movhs r1, #0
-; CHECK-NODSP-V7-NEXT: mov r0, r1
-; CHECK-NODSP-V7-NEXT: bx lr
-;
-; CHECK-DSP-LABEL: search_through_zext_load:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: ldrb r0, [r0]
-; CHECK-DSP-NEXT: subs r3, r1, r0
-; CHECK-DSP-NEXT: cmp r3, r2
-; CHECK-DSP-NEXT: it hi
-; CHECK-DSP-NEXT: movhi r1, r0
-; CHECK-DSP-NEXT: cmp r0, r2
-; CHECK-DSP-NEXT: it hs
-; CHECK-DSP-NEXT: movhs r1, #0
-; CHECK-DSP-NEXT: mov r0, r1
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: search_through_zext_load:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: ldrb r3, [r0]
-; CHECK-DSP-IMM-NEXT: subs r0, r1, r3
-; CHECK-DSP-IMM-NEXT: cmp r0, r2
-; CHECK-DSP-IMM-NEXT: it hi
-; CHECK-DSP-IMM-NEXT: movhi r1, r3
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: cmp r3, r2
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, r1
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %load = load i8, i8* %a
-  %conv = zext i8 %load to i16
-  %cmp1 = icmp ult i16 %conv, %c
-  br i1 %cmp1, label %if.then, label %if.end
-
-if.then:
-  %sub = sub nuw i8 %b, %load
-  %conv2 = zext i8 %sub to i16
-  %cmp3 = icmp ugt i16 %conv2, %c
-  %res = select i1 %cmp3, i8 %load, i8 %b
-  br label %if.end
-
-if.end:
-  %retval = phi i8 [ 0, %entry ], [ %res, %if.then ]
-  ret i8 %retval
-}
-
-define i16 @trunc_sink_less_than_cmp(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d) {
-; CHECK-NODSP-V8-LABEL: trunc_sink_less_than_cmp:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: push {r7, lr}
-; CHECK-NODSP-V8-NEXT: sub.w r12, r1, r0
-; CHECK-NODSP-V8-NEXT: adds r3, #1
-; CHECK-NODSP-V8-NEXT: uxth.w lr, r12
-; CHECK-NODSP-V8-NEXT: uxtb.w r12, r12
-; CHECK-NODSP-V8-NEXT: uxtb r3, r3
-; CHECK-NODSP-V8-NEXT: cmp r12, r3
-; CHECK-NODSP-V8-NEXT: it ls
-; CHECK-NODSP-V8-NEXT: movls r0, r1
-; CHECK-NODSP-V8-NEXT: cmp lr, r2
-; CHECK-NODSP-V8-NEXT: it hs
-; CHECK-NODSP-V8-NEXT: movhs r0, #0
-; CHECK-NODSP-V8-NEXT: pop {r7, pc}
-;
-; CHECK-NODSP-V7-LABEL: trunc_sink_less_than_cmp:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: .save {r7, lr}
-; CHECK-NODSP-V7-NEXT: push {r7, lr}
-; CHECK-NODSP-V7-NEXT: adds r3, #1
-; CHECK-NODSP-V7-NEXT: sub.w r12, r1, r0
-; CHECK-NODSP-V7-NEXT: uxtb.w lr, r12
-; CHECK-NODSP-V7-NEXT: uxtb r3, r3
-; CHECK-NODSP-V7-NEXT: cmp lr, r3
-; CHECK-NODSP-V7-NEXT: it ls
-; CHECK-NODSP-V7-NEXT: movls r0, r1
-; CHECK-NODSP-V7-NEXT: uxth.w r1, r12
-; CHECK-NODSP-V7-NEXT: cmp r1, r2
-; CHECK-NODSP-V7-NEXT: it hs
-; CHECK-NODSP-V7-NEXT: movhs r0, #0
-; CHECK-NODSP-V7-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-LABEL: trunc_sink_less_than_cmp:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: push {r7, lr}
-; CHECK-DSP-NEXT: adds r3, #1
-; CHECK-DSP-NEXT: sub.w r12, r1, r0
-; CHECK-DSP-NEXT: uxtb.w lr, r12
-; CHECK-DSP-NEXT: uxtb r3, r3
-; CHECK-DSP-NEXT: cmp lr, r3
-; CHECK-DSP-NEXT: it ls
-; CHECK-DSP-NEXT: movls r0, r1
-; CHECK-DSP-NEXT: uxth.w r1, r12
-; CHECK-DSP-NEXT: cmp r1, r2
-; CHECK-DSP-NEXT: it hs
-; CHECK-DSP-NEXT: movhs r0, #0
-; CHECK-DSP-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-IMM-LABEL: trunc_sink_less_than_cmp:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: push {r7, lr}
-; CHECK-DSP-IMM-NEXT: adds r3, #1
-; CHECK-DSP-IMM-NEXT: sub.w r12, r1, r0
-; CHECK-DSP-IMM-NEXT: uxtb r3, r3
-; CHECK-DSP-IMM-NEXT: uxtb.w lr, r12
-; CHECK-DSP-IMM-NEXT: cmp lr, r3
-; CHECK-DSP-IMM-NEXT: it hi
-; CHECK-DSP-IMM-NEXT: movhi r1, r0
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: uxth.w r3, r12
-; CHECK-DSP-IMM-NEXT: cmp r3, r2
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, r1
-; CHECK-DSP-IMM-NEXT: pop {r7, pc}
-entry:
-  %sub = sub nuw i16 %b, %a
-  %cmp = icmp ult i16 %sub, %c
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  %trunc = trunc i16 %sub to i8
-  %add = add nuw i8 %d, 1
-  %cmp2 = icmp ugt i8 %trunc, %add
-  %res = select i1 %cmp2, i16 %a, i16 %b
-  br label %if.end
-
-if.end:
-  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
-  ret i16 %retval
-}
-
-; TODO: We should be able to remove the uxth introduced to handle %sub
-define i16 @trunc_sink_less_than_arith(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
-; CHECK-NODSP-V8-LABEL: trunc_sink_less_than_arith:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: push {r4, lr}
-; CHECK-NODSP-V8-NEXT: sub.w lr, r1, r0
-; CHECK-NODSP-V8-NEXT: ldr.w r12, [sp, #8]
-; CHECK-NODSP-V8-NEXT: add r3, lr
-; CHECK-NODSP-V8-NEXT: uxtb r3, r3
-; CHECK-NODSP-V8-NEXT: uxth.w r4, lr
-; CHECK-NODSP-V8-NEXT: cmp r12, r3
-; CHECK-NODSP-V8-NEXT: it ls
-; CHECK-NODSP-V8-NEXT: movls r0, r1
-; CHECK-NODSP-V8-NEXT: cmp r4, r2
-; CHECK-NODSP-V8-NEXT: it hs
-; CHECK-NODSP-V8-NEXT: movhs r0, #0
-; CHECK-NODSP-V8-NEXT: pop {r4, pc}
-;
-; CHECK-NODSP-V7-LABEL: trunc_sink_less_than_arith:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: .save {r7, lr}
-; CHECK-NODSP-V7-NEXT: push {r7, lr}
-; CHECK-NODSP-V7-NEXT: sub.w lr, r1, r0
-; CHECK-NODSP-V7-NEXT: ldr.w r12, [sp, #8]
-; CHECK-NODSP-V7-NEXT: add r3, lr
-; CHECK-NODSP-V7-NEXT: uxtb r3, r3
-; CHECK-NODSP-V7-NEXT: cmp r12, r3
-; CHECK-NODSP-V7-NEXT: it ls
-; CHECK-NODSP-V7-NEXT: movls r0, r1
-; CHECK-NODSP-V7-NEXT: uxth.w r1, lr
-; CHECK-NODSP-V7-NEXT: cmp r1, r2
-; CHECK-NODSP-V7-NEXT: it hs
-; CHECK-NODSP-V7-NEXT: movhs r0, #0
-; CHECK-NODSP-V7-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-LABEL: trunc_sink_less_than_arith:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: push {r7, lr}
-; CHECK-DSP-NEXT: sub.w r12, r1, r0
-; CHECK-DSP-NEXT: add r3, r12
-; CHECK-DSP-NEXT: uxtb.w lr, r3
-; CHECK-DSP-NEXT: ldr r3, [sp, #8]
-; CHECK-DSP-NEXT: cmp r3, lr
-; CHECK-DSP-NEXT: it ls
-; CHECK-DSP-NEXT: movls r0, r1
-; CHECK-DSP-NEXT: uxth.w r1, r12
-; CHECK-DSP-NEXT: cmp r1, r2
-; CHECK-DSP-NEXT: it hs
-; CHECK-DSP-NEXT: movhs r0, #0
-; CHECK-DSP-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-IMM-LABEL: trunc_sink_less_than_arith:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: push {r7, lr}
-; CHECK-DSP-IMM-NEXT: sub.w lr, r1, r0
-; CHECK-DSP-IMM-NEXT: ldr.w r12, [sp, #8]
-; CHECK-DSP-IMM-NEXT: add r3, lr
-; CHECK-DSP-IMM-NEXT: uxtb r3, r3
-; CHECK-DSP-IMM-NEXT: cmp r12, r3
-; CHECK-DSP-IMM-NEXT: it hi
-; CHECK-DSP-IMM-NEXT: movhi r1, r0
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: uxth.w r3, lr
-; CHECK-DSP-IMM-NEXT: cmp r3, r2
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, r1
-; CHECK-DSP-IMM-NEXT: pop {r7, pc}
-entry:
-  %sub = sub nuw i16 %b, %a
-  %cmp = icmp ult i16 %sub, %c
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  %trunc = trunc i16 %sub to i8
-  %add = add nuw i8 %d, %trunc
-  %cmp2 = icmp ugt i8 %e, %add
-  %res = select i1 %cmp2, i16 %a, i16 %b
-  br label %if.end
-
-if.end:
-  %retval = phi i16 [ 0, %entry ], [ %res, %if.then ]
-  ret i16 %retval
-}
-
-define i16 @trunc_sink_less_than_store(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8* %e) {
-; CHECK-NODSP-LABEL: trunc_sink_less_than_store:
-; CHECK-NODSP: @ %bb.0: @ %entry
-; CHECK-NODSP-NEXT: subs r0, r1, r0
-; CHECK-NODSP-NEXT: cmp r0, r2
-; CHECK-NODSP-NEXT: iteee hs
-; CHECK-NODSP-NEXT: movhs r0, #0
-; CHECK-NODSP-NEXT: ldrlo r1, [sp]
-; CHECK-NODSP-NEXT: addlo r2, r3, r0
-; CHECK-NODSP-NEXT: strblo r2, [r1]
-; CHECK-NODSP-NEXT: bx lr
-;
-; CHECK-DSP-LABEL: trunc_sink_less_than_store:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: subs r0, r1, r0
-; CHECK-DSP-NEXT: cmp r0, r2
-; CHECK-DSP-NEXT: iteee hs
-; CHECK-DSP-NEXT: movhs r0, #0
-; CHECK-DSP-NEXT: ldrlo r1, [sp]
-; CHECK-DSP-NEXT: addlo r2, r3, r0
-; CHECK-DSP-NEXT: strblo r2, [r1]
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: trunc_sink_less_than_store:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: subs r0, r1, r0
-; CHECK-DSP-IMM-NEXT: cmp r0, r2
-; CHECK-DSP-IMM-NEXT: bhs .LBB19_2
-; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %if.then
-; CHECK-DSP-IMM-NEXT: ldr r1, [sp]
-; CHECK-DSP-IMM-NEXT: adds r2, r3, r0
-; CHECK-DSP-IMM-NEXT: strb r2, [r1]
-; CHECK-DSP-IMM-NEXT: bx lr
-; CHECK-DSP-IMM-NEXT: .LBB19_2:
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %sub = sub nuw i16 %b, %a
-  %cmp = icmp ult i16 %sub, %c
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  %trunc = trunc i16 %sub to i8
-  %add = add nuw i8 %d, %trunc
-  store i8 %add, i8* %e
-  br label %if.end
-
-if.end:
-  %retval = phi i16 [ 0, %entry ], [ %sub, %if.then ]
-  ret i16 %retval
-}
-
-define i8 @trunc_sink_less_than_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
-; CHECK-NODSP-LABEL: trunc_sink_less_than_ret:
-; CHECK-NODSP: @ %bb.0: @ %entry
-; CHECK-NODSP-NEXT: subs r1, r1, r0
-; CHECK-NODSP-NEXT: movs r0, #0
-; CHECK-NODSP-NEXT: cmp r1, r2
-; CHECK-NODSP-NEXT: it lo
-; CHECK-NODSP-NEXT: uxtablo r0, r3, r1
-; CHECK-NODSP-NEXT: bx lr
-;
-; CHECK-DSP-LABEL: trunc_sink_less_than_ret:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: subs r1, r1, r0
-; CHECK-DSP-NEXT: movs r0, #0
-; CHECK-DSP-NEXT: cmp r1, r2
-; CHECK-DSP-NEXT: it lo
-; CHECK-DSP-NEXT: uxtablo r0, r3, r1
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: trunc_sink_less_than_ret:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: subs r1, r1, r0
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: cmp r1, r2
-; CHECK-DSP-IMM-NEXT: uxtab r3, r3, r1
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, r3
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %sub = sub nuw i16 %b, %a
-  %cmp = icmp ult i16 %sub, %c
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  %trunc = trunc i16 %sub to i8
-  %add = add nuw i8 %d, %trunc
-  br label %if.end
-
-if.end:
-  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
-  ret i8 %retval
-}
-
-define zeroext i8 @trunc_sink_less_than_zext_ret(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i8 zeroext %d, i8 zeroext %e) {
-; CHECK-NODSP-LABEL: trunc_sink_less_than_zext_ret:
-; CHECK-NODSP: @ %bb.0: @ %entry
-; CHECK-NODSP-NEXT: subs r0, r1, r0
-; CHECK-NODSP-NEXT: movs r1, #0
-; CHECK-NODSP-NEXT: cmp r0, r2
-; CHECK-NODSP-NEXT: it lo
-; CHECK-NODSP-NEXT: addlo r1, r3, r0
-; CHECK-NODSP-NEXT: uxtb r0, r1
-; CHECK-NODSP-NEXT: bx lr
-;
-; CHECK-DSP-LABEL: trunc_sink_less_than_zext_ret:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: subs r0, r1, r0
-; CHECK-DSP-NEXT: movs r1, #0
-; CHECK-DSP-NEXT: cmp r0, r2
-; CHECK-DSP-NEXT: it lo
-; CHECK-DSP-NEXT: addlo r1, r3, r0
-; CHECK-DSP-NEXT: uxtb r0, r1
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: trunc_sink_less_than_zext_ret:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: subs r0, r1, r0
-; CHECK-DSP-IMM-NEXT: adds r1, r3, r0
-; CHECK-DSP-IMM-NEXT: movs r3, #0
-; CHECK-DSP-IMM-NEXT: cmp r0, r2
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r3, r1
-; CHECK-DSP-IMM-NEXT: uxtb r0, r3
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %sub = sub nuw i16 %b, %a
-  %cmp = icmp ult i16 %sub, %c
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  %trunc = trunc i16 %sub to i8
-  %add = add nuw i8 %d, %trunc
-  br label %if.end
-
-if.end:
-  %retval = phi i8 [ 0, %entry ], [ %add, %if.then ]
-  ret i8 %retval
-}
-
-define i32 @bitcast_i1(i16 zeroext %a, i32 %b, i32 %c) {
-; CHECK-LABEL: bitcast_i1:
-; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: ands r0, r0, #1
-; CHECK-NEXT: it ne
-; CHECK-NEXT: movne r0, r1
-; CHECK-NEXT: bx lr
-entry:
-  %0 = bitcast i1 1 to i1
-  %1 = trunc i16 %a to i1
-  %cmp = icmp eq i1 %1, %0
-  br i1 %cmp, label %if.then, label %exit
-
-if.then:
-  %conv = zext i1 %0 to i16
-  %conv1 = zext i1 %1 to i16
-  %cmp1 = icmp uge i16 %conv, %conv1
-  %select = select i1 %cmp1, i32 %b, i32 %c
-  br label %exit
-
-exit:
-  %retval = phi i32 [ %select, %if.then ], [ 0, %entry ]
-  ret i32 %retval
-}
-
-define void @search_back_through_trunc(i8* %a, i8* %b, i8* %c, i8* %d, i16* %e) {
-; CHECK-NODSP-V8-LABEL: search_back_through_trunc:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: push {r7, lr}
-; CHECK-NODSP-V8-NEXT: ldrb.w r12, [r0]
-; CHECK-NODSP-V8-NEXT: ldrb.w lr, [r1]
-; CHECK-NODSP-V8-NEXT: ldrb r1, [r2]
-; CHECK-NODSP-V8-NEXT: ldrb r0, [r3]
-; CHECK-NODSP-V8-NEXT: orr.w r12, lr, r12, lsl #8
-; CHECK-NODSP-V8-NEXT: orr.w r0, r0, r1, lsl #8
-; CHECK-NODSP-V8-NEXT: cmp r12, r0
-; CHECK-NODSP-V8-NEXT: beq .LBB23_2
-; CHECK-NODSP-V8-NEXT: @ %bb.1: @ %if.else136
-; CHECK-NODSP-V8-NEXT: ldr r0, [sp, #8]
-; CHECK-NODSP-V8-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V8-NEXT: uxtb.w lr, r0
-; CHECK-NODSP-V8-NEXT: lsrs r1, r0, #8
-; CHECK-NODSP-V8-NEXT: .LBB23_2: @ %if.end183
-; CHECK-NODSP-V8-NEXT: strb r1, [r2]
-; CHECK-NODSP-V8-NEXT: strb.w lr, [r3]
-; CHECK-NODSP-V8-NEXT: pop {r7, pc}
-;
-; CHECK-NODSP-V7-LABEL: search_back_through_trunc:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: .save {r4, lr}
-; CHECK-NODSP-V7-NEXT: push {r4, lr}
-; CHECK-NODSP-V7-NEXT: ldrb r4, [r0]
-; CHECK-NODSP-V7-NEXT: ldrb.w r12, [r2]
-; CHECK-NODSP-V7-NEXT: ldrb r0, [r1]
-; CHECK-NODSP-V7-NEXT: ldrb.w lr, [r3]
-; CHECK-NODSP-V7-NEXT: orr.w r4, r0, r4, lsl #8
-; CHECK-NODSP-V7-NEXT: orr.w r1, lr, r12, lsl #8
-; CHECK-NODSP-V7-NEXT: cmp r4, r1
-; CHECK-NODSP-V7-NEXT: itttt ne
-; CHECK-NODSP-V7-NEXT: ldrne r0, [sp, #8]
-; CHECK-NODSP-V7-NEXT: ldrhne r0, [r0]
-; CHECK-NODSP-V7-NEXT: lsrne.w r12, r0, #8
-; CHECK-NODSP-V7-NEXT: uxtbne r0, r0
-; CHECK-NODSP-V7-NEXT: strb.w r12, [r2]
-; CHECK-NODSP-V7-NEXT: strb r0, [r3]
-; CHECK-NODSP-V7-NEXT: pop {r4, pc}
-;
-; CHECK-DSP-LABEL: search_back_through_trunc:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: push {r4, lr}
-; CHECK-DSP-NEXT: ldrb r4, [r0]
-; CHECK-DSP-NEXT: ldrb r0, [r1]
-; CHECK-DSP-NEXT: ldrb.w r12, [r2]
-; CHECK-DSP-NEXT: ldrb.w lr, [r3]
-; CHECK-DSP-NEXT: orr.w lr, lr, r12, lsl #8
-; CHECK-DSP-NEXT: orr.w r1, r0, r4, lsl #8
-; CHECK-DSP-NEXT: cmp r1, lr
-; CHECK-DSP-NEXT: itttt ne
-; CHECK-DSP-NEXT: ldrne r0, [sp, #8]
-; CHECK-DSP-NEXT: ldrhne r0, [r0]
-; CHECK-DSP-NEXT: lsrne.w r12, r0, #8
-; CHECK-DSP-NEXT: uxtbne r0, r0
-; CHECK-DSP-NEXT: strb.w r12, [r2]
-; CHECK-DSP-NEXT: strb r0, [r3]
-; CHECK-DSP-NEXT: pop {r4, pc}
-;
-; CHECK-DSP-IMM-LABEL: search_back_through_trunc:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: push {r4, lr}
-; CHECK-DSP-IMM-NEXT: ldrb r4, [r0]
-; CHECK-DSP-IMM-NEXT: ldrb.w r12, [r2]
-; CHECK-DSP-IMM-NEXT: ldrb r0, [r1]
-; CHECK-DSP-IMM-NEXT: ldrb.w lr, [r3]
-; CHECK-DSP-IMM-NEXT: orr.w r4, r0, r4, lsl #8
-; CHECK-DSP-IMM-NEXT: orr.w r1, lr, r12, lsl #8
-; CHECK-DSP-IMM-NEXT: cmp r4, r1
-; CHECK-DSP-IMM-NEXT: beq .LBB23_2
-; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %if.else136
-; CHECK-DSP-IMM-NEXT: ldr r0, [sp, #8]
-; CHECK-DSP-IMM-NEXT: ldrh r0, [r0]
-; CHECK-DSP-IMM-NEXT: lsr.w r12, r0, #8
-; CHECK-DSP-IMM-NEXT: uxtb r0, r0
-; CHECK-DSP-IMM-NEXT: .LBB23_2: @ %if.end183
-; CHECK-DSP-IMM-NEXT: strb.w r12, [r2]
-; CHECK-DSP-IMM-NEXT: strb r0, [r3]
-; CHECK-DSP-IMM-NEXT: pop {r4, pc}
-entry:
-  %0 = load i8, i8* %a, align 1
-  %conv106 = zext i8 %0 to i16
-  %shl = shl nuw i16 %conv106, 8
-  %1 = load i8, i8* %b, align 1
-  %conv108 = zext i8 %1 to i16
-  %or109 = or i16 %shl, %conv108
-  %2 = load i8, i8* %c, align 1
-  %conv119 = zext i8 %2 to i16
-  %shl120 = shl nuw i16 %conv119, 8
-  %3 = load i8, i8* %d, align 1
-  %conv122 = zext i8 %3 to i16
-  %or123 = or i16 %shl120, %conv122
-  %cmp133 = icmp eq i16 %or109, %or123
-  br i1 %cmp133, label %if.end183, label %if.else136
-
-if.else136:
-  %4 = load i16, i16* %e, align 2
-  %extract.t854 = trunc i16 %4 to i8
-  %extract856 = lshr i16 %4, 8
-  %extract.t857 = trunc i16 %extract856 to i8
-  br label %if.end183
-
-if.end183:
-  %w.0.off0 = phi i8 [ %extract.t854, %if.else136 ], [ %1, %entry ]
-  %w.0.off8 = phi i8 [ %extract.t857, %if.else136 ], [ %2, %entry ]
-  store i8 %w.0.off8, i8* %c, align 1
-  store i8 %w.0.off0, i8* %d, align 1
-  ret void
-}
-
-@c = common dso_local local_unnamed_addr global i16 0, align 2
-@b = common dso_local local_unnamed_addr global i16 0, align 2
-@f = common dso_local local_unnamed_addr global i32 0, align 4
-@e = common dso_local local_unnamed_addr global i8 0, align 1
-@a = common dso_local local_unnamed_addr global i8 0, align 1
-@d = common dso_local local_unnamed_addr global i32 0, align 4
-
-define void @and_trunc_two_zext() {
-; CHECK-NODSP-V8-LABEL: and_trunc_two_zext:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: movw r1, :lower16:b
-; CHECK-NODSP-V8-NEXT: movt r1, :upper16:b
-; CHECK-NODSP-V8-NEXT: ldrh r1, [r1]
-; CHECK-NODSP-V8-NEXT: movw r3, :lower16:f
-; CHECK-NODSP-V8-NEXT: sxth r2, r1
-; CHECK-NODSP-V8-NEXT: movt r3, :upper16:f
-; CHECK-NODSP-V8-NEXT: str r2, [r3]
-; CHECK-NODSP-V8-NEXT: movw r3, :lower16:a
-; CHECK-NODSP-V8-NEXT: movt r3, :upper16:a
-; CHECK-NODSP-V8-NEXT: movw r0, :lower16:c
-; CHECK-NODSP-V8-NEXT: movw r2, :lower16:e
-; CHECK-NODSP-V8-NEXT: ldrb r3, [r3]
-; CHECK-NODSP-V8-NEXT: movt r0, :upper16:c
-; CHECK-NODSP-V8-NEXT: and r1, r1, #1
-; CHECK-NODSP-V8-NEXT: movt r2, :upper16:e
-; CHECK-NODSP-V8-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V8-NEXT: strb r1, [r2]
-; CHECK-NODSP-V8-NEXT: muls r1, r3, r1
-; CHECK-NODSP-V8-NEXT: uxtb r1, r1
-; CHECK-NODSP-V8-NEXT: movw r2, :lower16:d
-; CHECK-NODSP-V8-NEXT: orrs r0, r1
-; CHECK-NODSP-V8-NEXT: movt r2, :upper16:d
-; CHECK-NODSP-V8-NEXT: lsls r0, r0, #16
-; CHECK-NODSP-V8-NEXT: str r1, [r2]
-; CHECK-NODSP-V8-NEXT: it eq
-; CHECK-NODSP-V8-NEXT: bxeq lr
-; CHECK-NODSP-V8-NEXT: .p2align 2
-; CHECK-NODSP-V8-NEXT: .LBB24_1: @ %for.cond
-; CHECK-NODSP-V8-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NODSP-V8-NEXT: b .LBB24_1
-;
-; CHECK-NODSP-V7-LABEL: and_trunc_two_zext:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: movw r1, :lower16:b
-; CHECK-NODSP-V7-NEXT: movw r2, :lower16:a
-; CHECK-NODSP-V7-NEXT: movt r1, :upper16:b
-; CHECK-NODSP-V7-NEXT: movt r2, :upper16:a
-; CHECK-NODSP-V7-NEXT: ldrh r1, [r1]
-; CHECK-NODSP-V7-NEXT: movw r0, :lower16:c
-; CHECK-NODSP-V7-NEXT: ldrb r2, [r2]
-; CHECK-NODSP-V7-NEXT: movt r0, :upper16:c
-; CHECK-NODSP-V7-NEXT: and r3, r1, #1
-; CHECK-NODSP-V7-NEXT: ldrh.w r12, [r0]
-; CHECK-NODSP-V7-NEXT: movw r0, :lower16:e
-; CHECK-NODSP-V7-NEXT: muls r2, r3, r2
-; CHECK-NODSP-V7-NEXT: movt r0, :upper16:e
-; CHECK-NODSP-V7-NEXT: strb r3, [r0]
-; CHECK-NODSP-V7-NEXT: sxth r0, r1
-; CHECK-NODSP-V7-NEXT: movw r1, :lower16:f
-; CHECK-NODSP-V7-NEXT: movt r1, :upper16:f
-; CHECK-NODSP-V7-NEXT: str r0, [r1]
-; CHECK-NODSP-V7-NEXT: movw r1, :lower16:d
-; CHECK-NODSP-V7-NEXT: movt r1, :upper16:d
-; CHECK-NODSP-V7-NEXT: uxtb r0, r2
-; CHECK-NODSP-V7-NEXT: str r0, [r1]
-; CHECK-NODSP-V7-NEXT: orr.w r0, r0, r12
-; CHECK-NODSP-V7-NEXT: lsls r0, r0, #16
-; CHECK-NODSP-V7-NEXT: it eq
-; CHECK-NODSP-V7-NEXT: bxeq lr
-; CHECK-NODSP-V7-NEXT: .LBB24_1: @ %for.cond
-; CHECK-NODSP-V7-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NODSP-V7-NEXT: b .LBB24_1
-;
-; CHECK-DSP-LABEL: and_trunc_two_zext:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: movw r0, :lower16:b
-; CHECK-DSP-NEXT: movw r2, :lower16:f
-; CHECK-DSP-NEXT: movt r0, :upper16:b
-; CHECK-DSP-NEXT: movt r2, :upper16:f
-; CHECK-DSP-NEXT: ldrh r0, [r0]
-; CHECK-DSP-NEXT: sxth r1, r0
-; CHECK-DSP-NEXT: and r0, r0, #1
-; CHECK-DSP-NEXT: str r1, [r2]
-; CHECK-DSP-NEXT: movw r1, :lower16:e
-; CHECK-DSP-NEXT: movt r1, :upper16:e
-; CHECK-DSP-NEXT: strb r0, [r1]
-; CHECK-DSP-NEXT: movw r1, :lower16:a
-; CHECK-DSP-NEXT: movt r1, :upper16:a
-; CHECK-DSP-NEXT: ldrb r1, [r1]
-; CHECK-DSP-NEXT: muls r0, r1, r0
-; CHECK-DSP-NEXT: movw r1, :lower16:d
-; CHECK-DSP-NEXT: uxtb r0, r0
-; CHECK-DSP-NEXT: movt r1, :upper16:d
-; CHECK-DSP-NEXT: str r0, [r1]
-; CHECK-DSP-NEXT: movw r1, :lower16:c
-; CHECK-DSP-NEXT: movt r1, :upper16:c
-; CHECK-DSP-NEXT: ldrh r1, [r1]
-; CHECK-DSP-NEXT: orrs r0, r1
-; CHECK-DSP-NEXT: lsls r0, r0, #16
-; CHECK-DSP-NEXT: it eq
-; CHECK-DSP-NEXT: bxeq lr
-; CHECK-DSP-NEXT: .LBB24_1: @ %for.cond
-; CHECK-DSP-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-DSP-NEXT: b .LBB24_1
-;
-; CHECK-DSP-IMM-LABEL: and_trunc_two_zext:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: movw r1, :lower16:b
-; CHECK-DSP-IMM-NEXT: movw r2, :lower16:a
-; CHECK-DSP-IMM-NEXT: movt r1, :upper16:b
-; CHECK-DSP-IMM-NEXT: movt r2, :upper16:a
-; CHECK-DSP-IMM-NEXT: ldrh r1, [r1]
-; CHECK-DSP-IMM-NEXT: movw r0, :lower16:c
-; CHECK-DSP-IMM-NEXT: ldrb r2, [r2]
-; CHECK-DSP-IMM-NEXT: movt r0, :upper16:c
-; CHECK-DSP-IMM-NEXT: and r3, r1, #1
-; CHECK-DSP-IMM-NEXT: ldrh.w r12, [r0]
-; CHECK-DSP-IMM-NEXT: movw r0, :lower16:e
-; CHECK-DSP-IMM-NEXT: muls r2, r3, r2
-; CHECK-DSP-IMM-NEXT: movt r0, :upper16:e
-; CHECK-DSP-IMM-NEXT: strb r3, [r0]
-; CHECK-DSP-IMM-NEXT: sxth r0, r1
-; CHECK-DSP-IMM-NEXT: movw r1, :lower16:f
-; CHECK-DSP-IMM-NEXT: movt r1, :upper16:f
-; CHECK-DSP-IMM-NEXT: str r0, [r1]
-; CHECK-DSP-IMM-NEXT: movw r1, :lower16:d
-; CHECK-DSP-IMM-NEXT: uxtb r0, r2
-; CHECK-DSP-IMM-NEXT: movt r1, :upper16:d
-; CHECK-DSP-IMM-NEXT: str r0, [r1]
-; CHECK-DSP-IMM-NEXT: orr.w r0, r0, r12
-; CHECK-DSP-IMM-NEXT: lsls r0, r0, #16
-; CHECK-DSP-IMM-NEXT: beq .LBB24_2
-; CHECK-DSP-IMM-NEXT: .LBB24_1: @ %for.cond
-; CHECK-DSP-IMM-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-DSP-IMM-NEXT: b .LBB24_1
-; CHECK-DSP-IMM-NEXT: .LBB24_2: @ %if.end
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %0 = load i16, i16* @c, align 2
-  %1 = load i16, i16* @b, align 2
-  %conv = sext i16 %1 to i32
-  store i32 %conv, i32* @f, align 4
-  %2 = trunc i16 %1 to i8
-  %conv1 = and i8 %2, 1
-  store i8 %conv1, i8* @e, align 1
-  %3 = load i8, i8* @a, align 1
-  %narrow = mul nuw i8 %3, %conv1
-  %mul = zext i8 %narrow to i32
-  store i32 %mul, i32* @d, align 4
-  %4 = zext i8 %narrow to i16
-  %conv5 = or i16 %0, %4
-  %tobool = icmp eq i16 %conv5, 0
-  br i1 %tobool, label %if.end, label %for.cond
-
-for.cond:
-  br label %for.cond
-
-if.end:
-  ret void
-}
-
-define void @zext_urem_trunc() {
-; CHECK-NODSP-V8-LABEL: zext_urem_trunc:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: movw r0, :lower16:c
-; CHECK-NODSP-V8-NEXT: movt r0, :upper16:c
-; CHECK-NODSP-V8-NEXT: ldrh r1, [r0]
-; CHECK-NODSP-V8-NEXT: movw r0, :lower16:e
-; CHECK-NODSP-V8-NEXT: movt r0, :upper16:e
-; CHECK-NODSP-V8-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V8-NEXT: cbz r1, .LBB25_2
-; CHECK-NODSP-V8-NEXT: @ %bb.1: @ %cond.false
-; CHECK-NODSP-V8-NEXT: udiv r2, r0, r1
-; CHECK-NODSP-V8-NEXT: mls r0, r2, r1, r0
-; CHECK-NODSP-V8-NEXT: .LBB25_2: @ %cond.end
-; CHECK-NODSP-V8-NEXT: movw r1, :lower16:a
-; CHECK-NODSP-V8-NEXT: movt r1, :upper16:a
-; CHECK-NODSP-V8-NEXT: strb r0, [r1]
-; CHECK-NODSP-V8-NEXT: bx lr
-;
-; CHECK-NODSP-V7-LABEL: zext_urem_trunc:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: .save {r7, lr}
-; CHECK-NODSP-V7-NEXT: push {r7, lr}
-; CHECK-NODSP-V7-NEXT: movw r0, :lower16:e
-; CHECK-NODSP-V7-NEXT: movw r1, :lower16:c
-; CHECK-NODSP-V7-NEXT: movt r0, :upper16:e
-; CHECK-NODSP-V7-NEXT: movt r1, :upper16:c
-; CHECK-NODSP-V7-NEXT: ldrh r1, [r1]
-; CHECK-NODSP-V7-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V7-NEXT: cbz r1, .LBB25_2
-; CHECK-NODSP-V7-NEXT: @ %bb.1: @ %cond.false
-; CHECK-NODSP-V7-NEXT: bl __aeabi_uidivmod
-; CHECK-NODSP-V7-NEXT: mov r0, r1
-; CHECK-NODSP-V7-NEXT: .LBB25_2: @ %cond.end
-; CHECK-NODSP-V7-NEXT: movw r1, :lower16:a
-; CHECK-NODSP-V7-NEXT: movt r1, :upper16:a
-; CHECK-NODSP-V7-NEXT: strb r0, [r1]
-; CHECK-NODSP-V7-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-LABEL: zext_urem_trunc:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: movw r1, :lower16:c
-; CHECK-DSP-NEXT: movw r0, :lower16:e
-; CHECK-DSP-NEXT: movt r1, :upper16:c
-; CHECK-DSP-NEXT: movt r0, :upper16:e
-; CHECK-DSP-NEXT: ldrh r1, [r1]
-; CHECK-DSP-NEXT: ldrb r0, [r0]
-; CHECK-DSP-NEXT: cmp r1, #0
-; CHECK-DSP-NEXT: itt ne
-; CHECK-DSP-NEXT: udivne r2, r0, r1
-; CHECK-DSP-NEXT: mlsne r0, r2, r1, r0
-; CHECK-DSP-NEXT: movw r1, :lower16:a
-; CHECK-DSP-NEXT: movt r1, :upper16:a
-; CHECK-DSP-NEXT: strb r0, [r1]
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: zext_urem_trunc:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: movw r0, :lower16:e
-; CHECK-DSP-IMM-NEXT: movw r1, :lower16:c
-; CHECK-DSP-IMM-NEXT: movt r0, :upper16:e
-; CHECK-DSP-IMM-NEXT: movt r1, :upper16:c
-; CHECK-DSP-IMM-NEXT: ldrh r1, [r1]
-; CHECK-DSP-IMM-NEXT: ldrb r0, [r0]
-; CHECK-DSP-IMM-NEXT: cbz r1, .LBB25_2
-; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %cond.false
-; CHECK-DSP-IMM-NEXT: udiv r2, r0, r1
-; CHECK-DSP-IMM-NEXT: mls r0, r2, r1, r0
-; CHECK-DSP-IMM-NEXT: .LBB25_2: @ %cond.end
-; CHECK-DSP-IMM-NEXT: movw r1, :lower16:a
-; CHECK-DSP-IMM-NEXT: movt r1, :upper16:a
-; CHECK-DSP-IMM-NEXT: strb r0, [r1]
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %0 = load i16, i16* @c, align 2
-  %cmp = icmp eq i16 %0, 0
-  %1 = load i8, i8* @e, align 1
-  br i1 %cmp, label %cond.end, label %cond.false
-
-cond.false:
-  %rem.lhs.trunc = zext i8 %1 to i16
-  %rem7 = urem i16 %rem.lhs.trunc, %0
-  %rem.zext = trunc i16 %rem7 to i8
-  br label %cond.end
-
-cond.end:
-  %cond = phi i8 [ %rem.zext, %cond.false ], [ %1, %entry ]
-  store i8 %cond, i8* @a, align 1
-  ret void
-}
-
-define i1 @dont_replace_trunc_1(i8* %a, i16* %b, i16* %c, i32* %d, i8* %e, i32* %f) {
-; CHECK-NODSP-V8-LABEL: dont_replace_trunc_1:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: push {r4, lr}
-; CHECK-NODSP-V8-NEXT: ldrh r1, [r1]
-; CHECK-NODSP-V8-NEXT: ldrd r12, lr, [sp, #8]
-; CHECK-NODSP-V8-NEXT: sxth r4, r1
-; CHECK-NODSP-V8-NEXT: and r1, r1, #1
-; CHECK-NODSP-V8-NEXT: ldrh r2, [r2]
-; CHECK-NODSP-V8-NEXT: str.w r4, [lr]
-; CHECK-NODSP-V8-NEXT: strb.w r1, [r12]
-; CHECK-NODSP-V8-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V8-NEXT: muls r0, r1, r0
-; CHECK-NODSP-V8-NEXT: uxtb r1, r0
-; CHECK-NODSP-V8-NEXT: orr.w r0, r2, r1
-; CHECK-NODSP-V8-NEXT: uxth r0, r0
-; CHECK-NODSP-V8-NEXT: clz r0, r0
-; CHECK-NODSP-V8-NEXT: lsrs r0, r0, #5
-; CHECK-NODSP-V8-NEXT: str r1, [r3]
-; CHECK-NODSP-V8-NEXT: pop {r4, pc}
-;
-; CHECK-NODSP-V7-LABEL: dont_replace_trunc_1:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: .save {r4, lr}
-; CHECK-NODSP-V7-NEXT: push {r4, lr}
-; CHECK-NODSP-V7-NEXT: ldrh r1, [r1]
-; CHECK-NODSP-V7-NEXT: ldrd lr, r12, [sp, #8]
-; CHECK-NODSP-V7-NEXT: ldrh r2, [r2]
-; CHECK-NODSP-V7-NEXT: sxth r4, r1
-; CHECK-NODSP-V7-NEXT: and r1, r1, #1
-; CHECK-NODSP-V7-NEXT: str.w r4, [r12]
-; CHECK-NODSP-V7-NEXT: strb.w r1, [lr]
-; CHECK-NODSP-V7-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V7-NEXT: muls r0, r1, r0
-; CHECK-NODSP-V7-NEXT: uxtb r0, r0
-; CHECK-NODSP-V7-NEXT: str r0, [r3]
-; CHECK-NODSP-V7-NEXT: orrs r0, r2
-; CHECK-NODSP-V7-NEXT: uxth r0, r0
-; CHECK-NODSP-V7-NEXT: clz r0, r0
-; CHECK-NODSP-V7-NEXT: lsrs r0, r0, #5
-; CHECK-NODSP-V7-NEXT: pop {r4, pc}
-;
-; CHECK-DSP-LABEL: dont_replace_trunc_1:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: push {r7, lr}
-; CHECK-DSP-NEXT: ldrh r1, [r1]
-; CHECK-DSP-NEXT: ldrh.w r12, [r2]
-; CHECK-DSP-NEXT: ldr r2, [sp, #12]
-; CHECK-DSP-NEXT: sxth.w lr, r1
-; CHECK-DSP-NEXT: and r1, r1, #1
-; CHECK-DSP-NEXT: str.w lr, [r2]
-; CHECK-DSP-NEXT: ldr r2, [sp, #8]
-; CHECK-DSP-NEXT: strb r1, [r2]
-; CHECK-DSP-NEXT: ldrb r0, [r0]
-; CHECK-DSP-NEXT: muls r0, r1, r0
-; CHECK-DSP-NEXT: uxtb r0, r0
-; CHECK-DSP-NEXT: str r0, [r3]
-; CHECK-DSP-NEXT: orr.w r0, r0, r12
-; CHECK-DSP-NEXT: uxth r0, r0
-; CHECK-DSP-NEXT: clz r0, r0
-; CHECK-DSP-NEXT: lsrs r0, r0, #5
-; CHECK-DSP-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-IMM-LABEL: dont_replace_trunc_1:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: push {r4, lr}
-; CHECK-DSP-IMM-NEXT: ldrd lr, r12, [sp, #8]
-; CHECK-DSP-IMM-NEXT: ldrh r1, [r1]
-; CHECK-DSP-IMM-NEXT: ldrh r2, [r2]
-; CHECK-DSP-IMM-NEXT: sxth r4, r1
-; CHECK-DSP-IMM-NEXT: str.w r4, [r12]
-; CHECK-DSP-IMM-NEXT: and r1, r1, #1
-; CHECK-DSP-IMM-NEXT: strb.w r1, [lr]
-; CHECK-DSP-IMM-NEXT: ldrb r0, [r0]
-; CHECK-DSP-IMM-NEXT: muls r0, r1, r0
-; CHECK-DSP-IMM-NEXT: uxtb r0, r0
-; CHECK-DSP-IMM-NEXT: str r0, [r3]
-; CHECK-DSP-IMM-NEXT: orrs r0, r2
-; CHECK-DSP-IMM-NEXT: uxth r0, r0
-; CHECK-DSP-IMM-NEXT: clz r0, r0
-; CHECK-DSP-IMM-NEXT: lsrs r0, r0, #5
-; CHECK-DSP-IMM-NEXT: pop {r4, pc}
-entry:
-  %0 = load i16, i16* %c, align 2
-  %1 = load i16, i16* %b, align 2
-  %conv = sext i16 %1 to i32
-  store i32 %conv, i32* %f, align 4
-  %2 = trunc i16 %1 to i8
-  %conv1 = and i8 %2, 1
-  store i8 %conv1, i8* %e, align 1
-  %3 = load i8, i8* %a, align 1
-  %narrow = mul nuw i8 %3, %conv1
-  %mul = zext i8 %narrow to i32
-  store i32 %mul, i32* %d, align 4
-  %4 = zext i8 %narrow to i16
-  %conv5 = or i16 %0, %4
-  %tobool = icmp eq i16 %conv5, 0
-  ret i1 %tobool
-}
-
-define i32 @dont_replace_trunc_2(i16* %a, i8* %b) {
-; CHECK-NODSP-V8-LABEL: dont_replace_trunc_2:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V8-NEXT: cmp r0, #8
-; CHECK-NODSP-V8-NEXT: it ls
-; CHECK-NODSP-V8-NEXT: movls r0, #0
-; CHECK-NODSP-V8-NEXT: ldrb r2, [r1]
-; CHECK-NODSP-V8-NEXT: uxtb r0, r0
-; CHECK-NODSP-V8-NEXT: orrs r0, r2
-; CHECK-NODSP-V8-NEXT: strb r0, [r1]
-; CHECK-NODSP-V8-NEXT: bx lr
-;
-; CHECK-NODSP-V7-LABEL: dont_replace_trunc_2:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V7-NEXT: ldrb r2, [r1]
-; CHECK-NODSP-V7-NEXT: cmp r0, #8
-; CHECK-NODSP-V7-NEXT: it ls
-; CHECK-NODSP-V7-NEXT: movls r0, #0
-; CHECK-NODSP-V7-NEXT: uxtb r0, r0
-; CHECK-NODSP-V7-NEXT: orrs r0, r2
-; CHECK-NODSP-V7-NEXT: strb r0, [r1]
-; CHECK-NODSP-V7-NEXT: bx lr
-;
-; CHECK-DSP-LABEL: dont_replace_trunc_2:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: ldrh r0, [r0]
-; CHECK-DSP-NEXT: cmp r0, #8
-; CHECK-DSP-NEXT: it ls
-; CHECK-DSP-NEXT: movls r0, #0
-; CHECK-DSP-NEXT: ldrb r2, [r1]
-; CHECK-DSP-NEXT: uxtb r0, r0
-; CHECK-DSP-NEXT: orrs r0, r2
-; CHECK-DSP-NEXT: strb r0, [r1]
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: dont_replace_trunc_2:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: ldrh r0, [r0]
-; CHECK-DSP-IMM-NEXT: movs r2, #0
-; CHECK-DSP-IMM-NEXT: ldrb r3, [r1]
-; CHECK-DSP-IMM-NEXT: cmp r0, #8
-; CHECK-DSP-IMM-NEXT: it hi
-; CHECK-DSP-IMM-NEXT: movhi r2, r0
-; CHECK-DSP-IMM-NEXT: uxtb r0, r2
-; CHECK-DSP-IMM-NEXT: orrs r0, r3
-; CHECK-DSP-IMM-NEXT: strb r0, [r1]
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %0 = load i16, i16* %a, align 2
-  %cmp = icmp ugt i16 %0, 8
-  %narrow = select i1 %cmp, i16 %0, i16 0
-  %cond = trunc i16 %narrow to i8
-  %1 = load i8, i8* %b, align 1
-  %or = or i8 %1, %cond
-  store i8 %or, i8* %b, align 1
-  %conv5 = zext i8 %or to i32
-  ret i32 %conv5
-}
-
-define i32 @replace_trunk_with_mask(i16* %a) {
-; CHECK-NODSP-V8-LABEL: replace_trunk_with_mask:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V8-NEXT: cmp r0, #0
-; CHECK-NODSP-V8-NEXT: itt eq
-; CHECK-NODSP-V8-NEXT: moveq r0, #0
-; CHECK-NODSP-V8-NEXT: bxeq lr
-; CHECK-NODSP-V8-NEXT: movw r1, #535
-; CHECK-NODSP-V8-NEXT: udiv r2, r1, r0
-; CHECK-NODSP-V8-NEXT: mls r0, r2, r0, r1
-; CHECK-NODSP-V8-NEXT: movw r1, #43691
-; CHECK-NODSP-V8-NEXT: uxtb r0, r0
-; CHECK-NODSP-V8-NEXT: movt r1, #43690
-; CHECK-NODSP-V8-NEXT: umull r0, r1, r0, r1
-; CHECK-NODSP-V8-NEXT: lsrs r0, r1, #1
-; CHECK-NODSP-V8-NEXT: bx lr
-;
-; CHECK-NODSP-V7-LABEL: replace_trunk_with_mask:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: .save {r7, lr}
-; CHECK-NODSP-V7-NEXT: push {r7, lr}
-; CHECK-NODSP-V7-NEXT: ldrh r1, [r0]
-; CHECK-NODSP-V7-NEXT: cbz r1, .LBB28_2
-; CHECK-NODSP-V7-NEXT: @ %bb.1: @ %cond.false
-; CHECK-NODSP-V7-NEXT: movw r0, #535
-; CHECK-NODSP-V7-NEXT: bl __aeabi_uidivmod
-; CHECK-NODSP-V7-NEXT: uxtb r0, r1
-; CHECK-NODSP-V7-NEXT: movw r1, #43691
-; CHECK-NODSP-V7-NEXT: movt r1, #43690
-; CHECK-NODSP-V7-NEXT: umull r0, r1, r0, r1
-; CHECK-NODSP-V7-NEXT: lsrs r0, r1, #1
-; CHECK-NODSP-V7-NEXT: pop {r7, pc}
-; CHECK-NODSP-V7-NEXT: .LBB28_2:
-; CHECK-NODSP-V7-NEXT: movs r0, #0
-; CHECK-NODSP-V7-NEXT: pop {r7, pc}
-;
-; CHECK-DSP-LABEL: replace_trunk_with_mask:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: ldrh r0, [r0]
-; CHECK-DSP-NEXT: cmp r0, #0
-; CHECK-DSP-NEXT: itt eq
-; CHECK-DSP-NEXT: moveq r0, #0
-; CHECK-DSP-NEXT: bxeq lr
-; CHECK-DSP-NEXT: movw r1, #535
-; CHECK-DSP-NEXT: udiv r2, r1, r0
-; CHECK-DSP-NEXT: mls r0, r2, r0, r1
-; CHECK-DSP-NEXT: movw r1, #43691
-; CHECK-DSP-NEXT: uxtb r0, r0
-; CHECK-DSP-NEXT: movt r1, #43690
-; CHECK-DSP-NEXT: umull r0, r1, r0, r1
-; CHECK-DSP-NEXT: lsrs r0, r1, #1
-; CHECK-DSP-NEXT: bx lr
-;
-; CHECK-DSP-IMM-LABEL: replace_trunk_with_mask:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: ldrh r0, [r0]
-; CHECK-DSP-IMM-NEXT: cbz r0, .LBB28_2
-; CHECK-DSP-IMM-NEXT: @ %bb.1: @ %cond.false
-; CHECK-DSP-IMM-NEXT: movw r1, #535
-; CHECK-DSP-IMM-NEXT: udiv r2, r1, r0
-; CHECK-DSP-IMM-NEXT: mls r0, r2, r0, r1
-; CHECK-DSP-IMM-NEXT: movw r1, #43691
-; CHECK-DSP-IMM-NEXT: movt r1, #43690
-; CHECK-DSP-IMM-NEXT: uxtb r0, r0
-; CHECK-DSP-IMM-NEXT: umull r0, r1, r0, r1
-; CHECK-DSP-IMM-NEXT: lsrs r0, r1, #1
-; CHECK-DSP-IMM-NEXT: bx lr
-; CHECK-DSP-IMM-NEXT: .LBB28_2:
-; CHECK-DSP-IMM-NEXT: movs r0, #0
-; CHECK-DSP-IMM-NEXT: bx lr
-entry:
-  %0 = load i16, i16* %a
-  %cmp = icmp eq i16 %0, 0
-  br i1 %cmp, label %cond.end, label %cond.false
-
-cond.false:
-  %1 = urem i16 535, %0
-  %.lhs.trunc = trunc i16 %1 to i8
-  %2 = udiv i8 %.lhs.trunc, 3
-  %phitmp = zext i8 %2 to i32
-  br label %cond.end
-
-cond.end:
-  %cond = phi i32 [ %phitmp, %cond.false ], [ 0, %entry ]
-  ret i32 %cond
-}
-
-define float @test_i8_sitofp(i8* %ptr, i8 %arg) {
-; CHECK-NODSP-V8-LABEL: test_i8_sitofp:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V8-NEXT: uxtb r2, r1
-; CHECK-NODSP-V8-NEXT: cmp r0, r2
-; CHECK-NODSP-V8-NEXT: bne .LBB29_2
-; CHECK-NODSP-V8-NEXT: @ %bb.1:
-; CHECK-NODSP-V8-NEXT: vldr s0, .LCPI29_0
-; CHECK-NODSP-V8-NEXT: vmov r0, s0
-; CHECK-NODSP-V8-NEXT: bx lr
-; CHECK-NODSP-V8-NEXT: .LBB29_2: @ %if.end
-; CHECK-NODSP-V8-NEXT: sxtb r0, r1
-; CHECK-NODSP-V8-NEXT: vmov s0, r0
-; CHECK-NODSP-V8-NEXT: vcvt.f32.s32 s0, s0
-; CHECK-NODSP-V8-NEXT: vmov.f32 s2, #2.000000e+01
-; CHECK-NODSP-V8-NEXT: vdiv.f32 s0, s0, s2
-; CHECK-NODSP-V8-NEXT: vmov r0, s0
-; CHECK-NODSP-V8-NEXT: bx lr
-; CHECK-NODSP-V8-NEXT: .p2align 2
-; CHECK-NODSP-V8-NEXT: @ %bb.3:
-; CHECK-NODSP-V8-NEXT: .LCPI29_0:
-; CHECK-NODSP-V8-NEXT: .long 0 @ float 0
-;
-; CHECK-NODSP-V7-LABEL: test_i8_sitofp:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: ldrb r0, [r0]
-; CHECK-NODSP-V7-NEXT: uxtb r2, r1
-; CHECK-NODSP-V7-NEXT: cmp r0, r2
-; CHECK-NODSP-V7-NEXT: ittt eq
-; CHECK-NODSP-V7-NEXT: vldreq s0, .LCPI29_0
-; CHECK-NODSP-V7-NEXT: vmoveq r0, s0
-; CHECK-NODSP-V7-NEXT: bxeq lr
-; CHECK-NODSP-V7-NEXT: sxtb r0, r1
-; CHECK-NODSP-V7-NEXT: vmov.f32 s0, #2.000000e+01
-; CHECK-NODSP-V7-NEXT: vmov s2, r0
-; CHECK-NODSP-V7-NEXT: vcvt.f32.s32 s2, s2
-; CHECK-NODSP-V7-NEXT: vdiv.f32 s0, s2, s0
-; CHECK-NODSP-V7-NEXT: vmov r0, s0
-; CHECK-NODSP-V7-NEXT: bx lr
-; CHECK-NODSP-V7-NEXT: .p2align 2
-; CHECK-NODSP-V7-NEXT: @ %bb.1:
-; CHECK-NODSP-V7-NEXT: .LCPI29_0:
-; CHECK-NODSP-V7-NEXT: .long 0 @ float 0
-;
-; CHECK-DSP-LABEL: test_i8_sitofp:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: ldrb r0, [r0]
-; CHECK-DSP-NEXT: uxtb r2, r1
-; CHECK-DSP-NEXT: cmp r0, r2
-; CHECK-DSP-NEXT: ittt eq
-; CHECK-DSP-NEXT: vldreq s0, .LCPI29_0
-; CHECK-DSP-NEXT: vmoveq r0, s0
-; CHECK-DSP-NEXT: bxeq lr
-; CHECK-DSP-NEXT: sxtb r0, r1
-; CHECK-DSP-NEXT: vmov.f32 s0, #2.000000e+01
-; CHECK-DSP-NEXT: vmov s2, r0
-; CHECK-DSP-NEXT: vcvt.f32.s32 s2, s2
-; CHECK-DSP-NEXT: vdiv.f32 s0, s2, s0
-; CHECK-DSP-NEXT: vmov r0, s0
-; CHECK-DSP-NEXT: bx lr
-; CHECK-DSP-NEXT: .p2align 2
-; CHECK-DSP-NEXT: @ %bb.1:
-; CHECK-DSP-NEXT: .LCPI29_0:
-; CHECK-DSP-NEXT: .long 0 @ float 0
-;
-; CHECK-DSP-IMM-LABEL: test_i8_sitofp:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: ldrb r0, [r0]
-; CHECK-DSP-IMM-NEXT: uxtb r2, r1
-; CHECK-DSP-IMM-NEXT: cmp r0, r2
-; CHECK-DSP-IMM-NEXT: bne .LBB29_2
-; CHECK-DSP-IMM-NEXT: @ %bb.1:
-; CHECK-DSP-IMM-NEXT: vldr s0, .LCPI29_0
-; CHECK-DSP-IMM-NEXT: vmov r0, s0
-; CHECK-DSP-IMM-NEXT: bx lr
-; CHECK-DSP-IMM-NEXT: .LBB29_2: @ %if.end
-; CHECK-DSP-IMM-NEXT: sxtb r0, r1
-; CHECK-DSP-IMM-NEXT: vmov.f32 s0, #2.000000e+01
-; CHECK-DSP-IMM-NEXT: vmov s2, r0
-; CHECK-DSP-IMM-NEXT: vcvt.f32.s32 s2, s2
-; CHECK-DSP-IMM-NEXT: vdiv.f32 s0, s2, s0
-; CHECK-DSP-IMM-NEXT: vmov r0, s0
-; CHECK-DSP-IMM-NEXT: bx lr
-; CHECK-DSP-IMM-NEXT: .p2align 2
-; CHECK-DSP-IMM-NEXT: @ %bb.3:
-; CHECK-DSP-IMM-NEXT: .LCPI29_0:
-; CHECK-DSP-IMM-NEXT: .long 0 @ float 0
-entry:
-  %0 = load i8, i8* %ptr, align 1
-  %cmp = icmp eq i8 %0, %arg
-  br i1 %cmp, label %exit, label %if.end
-
-if.end:
-  %conv = sitofp i8 %arg to float
-  %div = fdiv float %conv, 2.000000e+01
-  br label %exit
-
-exit:
-  %res = phi float [ 0.0, %entry ], [ %div, %if.end ]
-  ret float %res
-}
-
-define float @test_i16_sitofp(i16* %ptr, i16 %arg) {
-; CHECK-NODSP-V8-LABEL: test_i16_sitofp:
-; CHECK-NODSP-V8: @ %bb.0: @ %entry
-; CHECK-NODSP-V8-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V8-NEXT: uxth r2, r1
-; CHECK-NODSP-V8-NEXT: cmp r0, r2
-; CHECK-NODSP-V8-NEXT: bne .LBB30_2
-; CHECK-NODSP-V8-NEXT: @ %bb.1:
-; CHECK-NODSP-V8-NEXT: vldr s0, .LCPI30_0
-; CHECK-NODSP-V8-NEXT: vmov r0, s0
-; CHECK-NODSP-V8-NEXT: bx lr
-; CHECK-NODSP-V8-NEXT: .LBB30_2: @ %if.end
-; CHECK-NODSP-V8-NEXT: sxth r0, r1
-; CHECK-NODSP-V8-NEXT: vmov s0, r0
-; CHECK-NODSP-V8-NEXT: vcvt.f32.s32 s0, s0
-; CHECK-NODSP-V8-NEXT: vmov.f32 s2, #2.000000e+01
-; CHECK-NODSP-V8-NEXT: vdiv.f32 s0, s0, s2
-; CHECK-NODSP-V8-NEXT: vmov r0, s0
-; CHECK-NODSP-V8-NEXT: bx lr
-; CHECK-NODSP-V8-NEXT: .p2align 2
-; CHECK-NODSP-V8-NEXT: @ %bb.3:
-; CHECK-NODSP-V8-NEXT: .LCPI30_0:
-; CHECK-NODSP-V8-NEXT: .long 0 @ float 0
-;
-; CHECK-NODSP-V7-LABEL: test_i16_sitofp:
-; CHECK-NODSP-V7: @ %bb.0: @ %entry
-; CHECK-NODSP-V7-NEXT: ldrh r0, [r0]
-; CHECK-NODSP-V7-NEXT: uxth r2, r1
-; CHECK-NODSP-V7-NEXT: cmp r0, r2
-; CHECK-NODSP-V7-NEXT: ittt eq
-; CHECK-NODSP-V7-NEXT: vldreq s0, .LCPI30_0
-; CHECK-NODSP-V7-NEXT: vmoveq r0, s0
-; CHECK-NODSP-V7-NEXT: bxeq lr
-; CHECK-NODSP-V7-NEXT: sxth r0, r1
-; CHECK-NODSP-V7-NEXT: vmov.f32 s0, #2.000000e+01
-; CHECK-NODSP-V7-NEXT: vmov s2, r0
-; CHECK-NODSP-V7-NEXT: vcvt.f32.s32 s2, s2
-; CHECK-NODSP-V7-NEXT: vdiv.f32 s0, s2, s0
-; CHECK-NODSP-V7-NEXT: vmov r0, s0
-; CHECK-NODSP-V7-NEXT: bx lr
-; CHECK-NODSP-V7-NEXT: .p2align 2
-; CHECK-NODSP-V7-NEXT: @ %bb.1:
-; CHECK-NODSP-V7-NEXT: .LCPI30_0:
-; CHECK-NODSP-V7-NEXT: .long 0 @ float 0
-;
-; CHECK-DSP-LABEL: test_i16_sitofp:
-; CHECK-DSP: @ %bb.0: @ %entry
-; CHECK-DSP-NEXT: ldrh r0, [r0]
-; CHECK-DSP-NEXT: uxth r2, r1
-; CHECK-DSP-NEXT: cmp r0, r2
-; CHECK-DSP-NEXT: ittt eq
-; CHECK-DSP-NEXT: vldreq s0, .LCPI30_0
-; CHECK-DSP-NEXT: vmoveq r0, s0
-; CHECK-DSP-NEXT: bxeq lr
-; CHECK-DSP-NEXT: sxth r0, r1
-; CHECK-DSP-NEXT: vmov.f32 s0, #2.000000e+01
-; CHECK-DSP-NEXT: vmov s2, r0
-; CHECK-DSP-NEXT: vcvt.f32.s32 s2, s2
-; CHECK-DSP-NEXT: vdiv.f32 s0, s2, s0
-; CHECK-DSP-NEXT: vmov r0, s0
-; CHECK-DSP-NEXT: bx lr
-; CHECK-DSP-NEXT: .p2align 2
-; CHECK-DSP-NEXT: @ %bb.1:
-; CHECK-DSP-NEXT: .LCPI30_0:
-; CHECK-DSP-NEXT: .long 0 @ float 0
-;
-; CHECK-DSP-IMM-LABEL: test_i16_sitofp:
-; CHECK-DSP-IMM: @ %bb.0: @ %entry
-; CHECK-DSP-IMM-NEXT: ldrh r0, [r0]
-; CHECK-DSP-IMM-NEXT: uxth r2, r1
-; CHECK-DSP-IMM-NEXT: cmp r0, r2
-; CHECK-DSP-IMM-NEXT: bne .LBB30_2
-; CHECK-DSP-IMM-NEXT: @ %bb.1:
-; CHECK-DSP-IMM-NEXT: vldr s0, .LCPI30_0
-; CHECK-DSP-IMM-NEXT: vmov r0, s0
-; CHECK-DSP-IMM-NEXT: bx lr
-; CHECK-DSP-IMM-NEXT: .LBB30_2: @ %if.end
-; CHECK-DSP-IMM-NEXT: sxth r0, r1
-; CHECK-DSP-IMM-NEXT: vmov.f32 s0, #2.000000e+01
-; CHECK-DSP-IMM-NEXT: vmov s2, r0
-; CHECK-DSP-IMM-NEXT: vcvt.f32.s32 s2, s2
-; CHECK-DSP-IMM-NEXT: vdiv.f32 s0, s2, s0
-; CHECK-DSP-IMM-NEXT: vmov r0, s0
-; CHECK-DSP-IMM-NEXT: bx lr
-; CHECK-DSP-IMM-NEXT: .p2align 2
-; CHECK-DSP-IMM-NEXT: @ %bb.3:
-; CHECK-DSP-IMM-NEXT: .LCPI30_0:
-; CHECK-DSP-IMM-NEXT: .long 0 @ float 0
-entry:
-  %0 = load i16, i16* %ptr, align 1
-  %cmp = icmp eq i16 %0, %arg
-  br i1 %cmp, label %exit, label %if.end
-
-if.end:
-  %conv = sitofp i16 %arg to float
-  %div = fdiv float %conv, 2.000000e+01
-  br label %exit
-
-exit:
-  %res = phi float [ 0.0, %entry ], [ %div, %if.end ]
-  ret float %res
-}
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
deleted file mode 100644
index 76c9746c35566..0000000000000
--- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-icmps.ll
+++ /dev/null
@@ -1,332 +0,0 @@
-; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 %s -arm-disable-cgp=false -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP
-; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP
-; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM
-
-; CHECK-COMMON-LABEL: test_ult_254_inc_imm:
-; CHECK-DSP: adds r0, #1
-; CHECK-DSP-NEXT: uxtb r1, r0
-; CHECK-DSP-NEXT: movs r0, #47
-; CHECK-DSP-NEXT: cmp r1, #254
-; CHECK-DSP-NEXT: it lo
-; CHECK-DSP-NEXT: movlo r0, #35
-
-; CHECK-DSP-IMM: movs r1, #1
-; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
-; CHECK-DSP-IMM-NEXT: movs r0, #47
-; CHECK-DSP-IMM-NEXT: cmp r1, #254
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, #35
-define i32 @test_ult_254_inc_imm(i8 zeroext %x) {
-entry:
-  %add = add i8 %x, 1
-  %cmp = icmp ult i8 %add, 254
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_slt_254_inc_imm
-; CHECK-COMMON: adds
-; CHECK-COMMON: sxtb
-define i32 @test_slt_254_inc_imm(i8 signext %x) {
-entry:
-  %add = add i8 %x, 1
-  %cmp = icmp slt i8 %add, 254
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_ult_254_inc_var:
-; CHECK-NODSP: add r0, r1
-; CHECK-NODSP-NEXT: uxtb r1, r0
-; CHECK-NODSP-NEXT: movs r0, #47
-; CHECK-NODSP-NEXT: cmp r1, #254
-; CHECK-NODSP-NEXT: it lo
-; CHECK-NODSP-NEXT: movlo r0, #35
-
-; CHECK-DSP: uadd8 r1, r0, r1
-; CHECK-DSP-NEXT: movs r0, #47
-; CHECK-DSP-NEXT: cmp r1, #254
-; CHECK-DSP-NEXT: it lo
-; CHECK-DSP-NEXT: movlo r0, #35
-define i32 @test_ult_254_inc_var(i8 zeroext %x, i8 zeroext %y) {
-entry:
-  %add = add i8 %x, %y
-  %cmp = icmp ult i8 %add, 254
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_sle_254_inc_var
-; CHECK-COMMON: add
-; CHECK-COMMON: sxtb
-; CHECK-COMMON: cmp
-define i32 @test_sle_254_inc_var(i8 %x, i8 %y) {
-entry:
-  %add = add i8 %x, %y
-  %cmp = icmp sle i8 %add, 254
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_ugt_1_dec_imm:
-; CHECK-COMMON: subs r1, r0, #1
-; CHECK-COMMON-NEXT: movs r0, #47
-; CHECK-COMMON-NEXT: cmp r1, #1
-; CHECK-COMMON-NEXT: it hi
-; CHECK-COMMON-NEXT: movhi r0, #35
-define i32 @test_ugt_1_dec_imm(i8 zeroext %x) {
-entry:
-  %add = add i8 %x, -1
-  %cmp = icmp ugt i8 %add, 1
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_sgt_1_dec_imm
-; CHECK-COMMON: subs
-; CHECK-COMMON: sxtb
-; CHECK-COMMON: cmp
-define i32 @test_sgt_1_dec_imm(i8 %x) {
-entry:
-  %add = add i8 %x, -1
-  %cmp = icmp sgt i8 %add, 1
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_ugt_1_dec_var:
-; CHECK-NODSP: subs r0, r0, r1
-; CHECK-NODSP-NEXT: uxtb r1, r0
-; CHECK-NODSP-NEXT: movs r0, #47
-; CHECK-NODSP-NEXT: cmp r1, #1
-; CHECK-NODSP-NEXT: it hi
-; CHECK-NODSP-NEXT: movhi r0, #35
-
-; CHECK-DSP: usub8 r1, r0, r1
-; CHECK-DSP-NEXT: movs r0, #47
-; CHECK-DSP-NEXT: cmp r1, #1
-; CHECK-DSP-NEXT: it hi
-; CHECK-DSP-NEXT: movhi r0, #35
-define i32 @test_ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
-entry:
-  %sub = sub i8 %x, %y
-  %cmp = icmp ugt i8 %sub, 1
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: test_sge_1_dec_var
-; CHECK-COMMON: sub
-; CHECK-COMMON: sxtb
-; CHECK-COMMON: cmp
-define i32 @test_sge_1_dec_var(i8 %x, i8 %y) {
-entry:
-  %sub = sub i8 %x, %y
-  %cmp = icmp sge i8 %sub, 1
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: dsp_imm1:
-; CHECK-DSP: eors r1, r0
-; CHECK-DSP-NEXT: and r0, r0, #7
-; CHECK-DSP-NEXT: subs r0, r0, r1
-; CHECK-DSP-NEXT: adds r0, #1
-; CHECK-DSP-NEXT: uxtb r1, r0
-; CHECK-DSP-NEXT: movs r0, #47
-; CHECK-DSP-NEXT: cmp r1, #254
-; CHECK-DSP-NEXT: it lo
-; CHECK-DSP-NEXT: movlo r0, #35
-
-; CHECK-DSP-IMM: eors r1, r0
-; CHECK-DSP-IMM-NEXT: and r0, r0, #7
-; CHECK-DSP-IMM-NEXT: usub8 r0, r0, r1
-; CHECK-DSP-IMM-NEXT: movs r1, #1
-; CHECK-DSP-IMM-NEXT: uadd8 r1, r0, r1
-; CHECK-DSP-IMM-NEXT: movs r0, #47
-; CHECK-DSP-IMM-NEXT: cmp r1, #254
-; CHECK-DSP-IMM-NEXT: it lo
-; CHECK-DSP-IMM-NEXT: movlo r0, #35
-define i32 @dsp_imm1(i8 zeroext %x, i8 zeroext %y) {
-entry:
-  %xor = xor i8 %x, %y
-  %and = and i8 %x, 7
-  %sub = sub i8 %and, %xor
-  %add = add i8 %sub, 1
-  %cmp = icmp ult i8 %add, 254
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: dsp_var:
-; CHECK-COMMON: eors r1, r0
-; CHECK-COMMON: and r2, r0, #7
-; CHECK-NODSP: subs r1, r2, r1
-; CHECK-NODSP: add.w r0, r1, r0, lsl #1
-; CHECK-NODSP: uxtb r1, r0
-; CHECK-DSP: usub8 r1, r2, r1
-; CHECK-DSP: lsls r0, r0, #1
-; CHECK-DSP: uadd8 r1, r1, r0
-; CHECK-DSP-NOT: uxt
-; CHECK-COMMON: movs r0, #47
-; CHECK-COMMON: cmp r1, #254
-; CHECK-COMMON: it lo
-; CHECK-COMMON: movlo r0, #35
-define i32 @dsp_var(i8 zeroext %x, i8 zeroext %y) {
-  %xor = xor i8 %x, %y
-  %and = and i8 %x, 7
-  %sub = sub i8 %and, %xor
-  %mul = shl nuw i8 %x, 1
-  %add = add i8 %sub, %mul
-  %cmp = icmp ult i8 %add, 254
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: store_dsp_res
-; CHECK-DSP: usub8
-; CHECK-DSP: strb
-define void @store_dsp_res(i8* %in, i8* %out, i8 %compare) {
-  %first = getelementptr inbounds i8, i8* %in, i32 0
-  %second = getelementptr inbounds i8, i8* %in, i32 1
-  %ld0 = load i8, i8* %first
-  %ld1 = load i8, i8* %second
-  %xor = xor i8 %ld0, -1
-  %cmp = icmp ult i8 %compare, %ld1
-  %select = select i1 %cmp, i8 %compare, i8 %xor
-  %sub = sub i8 %ld0, %select
-  store i8 %sub, i8* %out, align 1
-  ret void
-}
-
-; CHECK-COMMON-LABEL: ugt_1_dec_imm:
-; CHECK-COMMON: subs r1, r0, #1
-; CHECK-COMMON-NEXT: movs r0, #47
-; CHECK-COMMON-NEXT: cmp r1, #1
-; CHECK-COMMON-NEXT: it hi
-; CHECK-COMMON-NEXT: movhi r0, #35
-define i32 @ugt_1_dec_imm(i8 zeroext %x) {
-entry:
-  %add = add i8 %x, -1
-  %cmp = icmp ugt i8 %add, 1
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: ugt_1_dec_var:
-; CHECK-NODSP: subs r0, r0, r1
-; CHECK-NODSP-NEXT: uxtb r1, r0
-; CHECK-NODSP-NEXT: movs r0, #47
-; CHECK-NODSP-NEXT: cmp r1, #1
-; CHECK-NODSP-NEXT: it hi
-; CHECK-NODSP-NEXT: movhi r0, #35
-
-; CHECK-DSP: usub8 r1, r0, r1
-; CHECK-DSP-NEXT: movs r0, #47
-; CHECK-DSP-NEXT: cmp r1, #1
-; CHECK-DSP-NEXT: it hi
-; CHECK-DSP-NEXT: movhi r0, #35
-define i32 @ugt_1_dec_var(i8 zeroext %x, i8 zeroext %y) {
-entry:
-  %sub = sub i8 %x, %y
-  %cmp = icmp ugt i8 %sub, 1
-  %res = select i1 %cmp, i32 35, i32 47
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: icmp_eq_minus_one
-; CHECK-COMMON: cmp {{r[0-9]+}}, #255
-define i32 @icmp_eq_minus_one(i8* %ptr) {
-  %load = load i8, i8* %ptr, align 1
-  %conv = zext i8 %load to i32
-  %cmp = icmp eq i8 %load, -1
-  %ret = select i1 %cmp, i32 %conv, i32 -1
-  ret i32 %ret
-}
-
-; CHECK-COMMON-LABEL: icmp_not
-; CHECK-COMMON: movw r2, #65535
-; CHECK-COMMON: eors r2, r0
-; CHECK-COMMON: movs r0, #32
-; CHECK-COMMON: cmp r2, r1
-define i32 @icmp_not(i16 zeroext %arg0, i16 zeroext %arg1) {
-  %not = xor i16 %arg0, -1
-  %cmp = icmp eq i16 %not, %arg1
-  %res = select i1 %cmp, i32 16, i32 32
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: icmp_i1
-; CHECK-NOT: uxt
-define i32 @icmp_i1(i1* %arg0, i1 zeroext %arg1, i32 %a, i32 %b) {
-entry:
-  %load = load i1, i1* %arg0
-  %not = xor i1 %load, 1
-  %cmp = icmp eq i1 %arg1, %not
-  %res = select i1 %cmp, i32 %a, i32 %b
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: icmp_i7
-; CHECK-COMMON: ldrb
-; CHECK-COMMON: cmp
-define i32 @icmp_i7(i7* %arg0, i7 zeroext %arg1, i32 %a, i32 %b) {
-entry:
-  %load = load i7, i7* %arg0
-  %add = add nuw i7 %load, 1
-  %cmp = icmp ult i7 %arg1, %add
-  %res = select i1 %cmp, i32 %a, i32 %b
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: icmp_i15
-; CHECK-COMMON: movw [[MINUS_ONE:r[0-9]+]], #32767
-define i32 @icmp_i15(i15 zeroext %arg0, i15 zeroext %arg1) {
-  %xor = xor i15 %arg0, -1
-  %cmp = icmp eq i15 %xor, %arg1
-  %res = select i1 %cmp, i32 21, i32 42
-  ret i32 %res
-}
-
-; CHECK-COMMON-LABEL: icmp_minus_imm
-; CHECK-NODSP: subs [[SUB:r[0-9]+]],
-; CHECK-NODSP: uxtb [[UXT:r[0-9]+]],
-; CHECK-NODSP: cmp [[UXT]], #251
-
-; CHECK-DSP: subs [[SUB:r[0-9]+]],
-; CHECK-DSP: uxtb [[UXT:r[0-9]+]],
-; CHECK-DSP: cmp [[UXT]], #251
-
-; CHECK-DSP-IMM: ldrb [[A:r[0-9]+]],
-; CHECK-DSP-IMM: movs [[MINUS_7:r[0-9]+]], #249
-; CHECK-DSP-IMM: uadd8 [[RES:r[0-9]+]], [[A]], [[MINUS_7]]
-; CHECK-DSP-IMM: cmp [[RES]], #251
-define i32 @icmp_minus_imm(i8* %a) {
-entry:
-  %0 = load i8, i8* %a, align 1
-  %add.i = add i8 %0, -7
-  %cmp = icmp ugt i8 %add.i, -5
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; CHECK-COMMON-LABEL: mul_with_neg_imm
-; CHECK-COMMON-NOT: uxtb
-; CHECK-COMMON: and [[BIT0:r[0-9]+]], r0, #1
-; CHECK-COMMON: add.w [[MUL32:r[0-9]+]], [[BIT0]], [[BIT0]], lsl #5
-; CHECK-COMMON: cmp.w r0, [[MUL32]], lsl #2
-define void @mul_with_neg_imm(i32, i32* %b) {
-entry:
-  %1 = trunc i32 %0 to i8
-  %2 = and i8 %1, 1
-  %conv.i = mul nuw i8 %2, -124
-  %tobool = icmp eq i8 %conv.i, 0
-  br i1 %tobool, label %if.end, label %if.then
-
-if.then:
-  store i32 0, i32* %b, align 4
-  br label %if.end
-
-if.end:
-  ret void
-}
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
deleted file mode 100644
index c446ddbdd07a7..0000000000000
--- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
+++ /dev/null
@@ -1,279 +0,0 @@
-; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -mattr=-use-misched %s -arm-disable-cgp=false -o - | FileCheck %s
-
-; CHECK: overflow_add
-; CHECK: add
-; CHECK: uxth
-; CHECK: cmp
-define zeroext i16 @overflow_add(i16 zeroext %a, i16 zeroext %b) {
-  %add = add i16 %a, %b
-  %or = or i16 %add, 1
-  %cmp = icmp ugt i16 %or, 1024
-  %res = select i1 %cmp, i16 2, i16 5
-  ret i16 %res
-}
-
-; CHECK-LABEL: overflow_sub
-; CHECK: sub
-; CHECK: uxth
-; CHECK: cmp
-define zeroext i16 @overflow_sub(i16 zeroext %a, i16 zeroext %b) {
-  %add = sub i16 %a, %b
-  %or = or i16 %add, 1
-  %cmp = icmp ugt i16 %or, 1024
-  %res = select i1 %cmp, i16 2, i16 5
-  ret i16 %res
-}
-
-; CHECK-LABEL: overflow_mul
-; CHECK: mul
-; CHECK: uxth
-; CHECK: cmp
-define zeroext i16 @overflow_mul(i16 zeroext %a, i16 zeroext %b) {
-  %add = mul i16 %a, %b
-  %or = or i16 %add, 1
-  %cmp = icmp ugt i16 %or, 1024
-  %res = select i1 %cmp, i16 2, i16 5
-  ret i16 %res
-}
-
-; CHECK-LABEL: overflow_shl
-; CHECK-COMMON: lsl
-; CHECK-COMMON: uxth
-; CHECK-COMMON: cmp
-define zeroext i16 @overflow_shl(i16 zeroext %a, i16 zeroext %b) {
-  %add = shl i16 %a, %b
-  %or = or i16 %add, 1
-  %cmp = icmp ugt i16 %or, 1024
-  %res = select i1 %cmp, i16 2, i16 5
-  ret i16 %res
-}
-
-; CHECK-LABEL: overflow_add_no_consts:
-; CHECK: add r0, r1
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], r2
-; CHECK: movhi r0, #8
-define i32 @overflow_add_no_consts(i8 zeroext %a, i8 zeroext %b, i8 zeroext %limit) {
-  %add = add i8 %a, %b
-  %cmp = icmp ugt i8 %add, %limit
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: overflow_add_const_limit:
-; CHECK: add r0, r1
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #128
-; CHECK: movhi r0, #8
-define i32 @overflow_add_const_limit(i8 zeroext %a, i8 zeroext %b) {
-  %add = add i8 %a, %b
-  %cmp = icmp ugt i8 %add, 128
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: overflow_add_positive_const_limit:
-; CHECK: adds r0, #1
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #128
-; CHECK: movhi r0, #8
-define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
-  %add = add i8 %a, 1
-  %cmp = icmp ugt i8 %add, 128
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: unsafe_add_underflow:
-; CHECK: movs r1, #16
-; CHECK: cmp r0, #1
-; CHECK: it eq
-; CHECK: moveq r1, #8
-; CHECK: mov r0, r1
-define i32 @unsafe_add_underflow(i8 zeroext %a) {
-  %add = add i8 %a, -2
-  %cmp = icmp ugt i8 %add, 254
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: safe_add_underflow:
-; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #1
-; CHECK-NOT: uxtb
-; CHECK: cmp [[MINUS_1]], #254
-; CHECK: movhi r0, #8
-define i32 @safe_add_underflow(i8 zeroext %a) {
-  %add = add i8 %a, -1
-  %cmp = icmp ugt i8 %add, 254
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: safe_add_underflow_neg:
-; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #2
-; CHECK-NOT: uxtb
-; CHECK: cmp [[MINUS_1]], #251
-; CHECK: movlo r0, #8
-define i32 @safe_add_underflow_neg(i8 zeroext %a) {
-  %add = add i8 %a, -2
-  %cmp = icmp ule i8 %add, -6
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: overflow_sub_negative_const_limit:
-; CHECK: adds r0, #1
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #128
-; CHECK: movhi r0, #8
-define i32 @overflow_sub_negative_const_limit(i8 zeroext %a) {
-  %sub = sub i8 %a, -1
-  %cmp = icmp ugt i8 %sub, 128
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: unsafe_sub_underflow:
-; CHECK: subs r0, #6
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #250
-; CHECK: movhi r0, #8
-define i32 @unsafe_sub_underflow(i8 zeroext %a) {
-  %sub = sub i8 %a, 6
-  %cmp = icmp ugt i8 %sub, 250
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: safe_sub_underflow:
-; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #1
-; CHECK-NOT: uxtb
-; CHECK: cmp [[MINUS_1]], #255
-; CHECK: movlo r0, #8
-define i32 @safe_sub_underflow(i8 zeroext %a) {
-  %sub = sub i8 %a, 1
-  %cmp = icmp ule i8 %sub, 254
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: safe_sub_underflow_neg
-; CHECK: subs [[MINUS_1:r[0-9]+]], r0, #4
-; CHECK-NOT: uxtb
-; CHECK: cmp [[MINUS_1]], #250
-; CHECK: movhi r0, #8
-define i32 @safe_sub_underflow_neg(i8 zeroext %a) {
-  %sub = sub i8 %a, 4
-  %cmp = icmp uge i8 %sub, -5
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK-LABEL: unsafe_sub_underflow_neg
-; CHECK: subs r0, #4
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #253
-; CHECK: movlo r0, #8
-define i32 @unsafe_sub_underflow_neg(i8 zeroext %a) {
-  %sub = sub i8 %a, 4
-  %cmp = icmp ult i8 %sub, -3
-  %res = select i1 %cmp, i32 8, i32 16
-  ret i32 %res
-}
-
-; CHECK: rsb.w [[RSUB:r[0-9]+]], r0, #248
-; CHECK-NOT: uxt
-; CHECK: cmp [[RSUB]], #252
-define i32 @safe_sub_imm_var(i8* %b) {
-entry:
-  %0 = load i8, i8* %b, align 1
-  %sub = sub nuw nsw i8 -8, %0
-  %cmp = icmp ugt i8 %sub, 252
-  %conv4 = zext i1 %cmp to i32
-  ret i32 %conv4
-}
-
-; CHECK-LABEL: safe_sub_var_imm
-; CHECK: sub.w [[ADD:r[0-9]+]], r0, #248
-; CHECK-NOT: uxt
-; CHECK: cmp [[ADD]], #252
-define i32 @safe_sub_var_imm(i8* %b) {
-entry:
-  %0 = load i8, i8* %b, align 1
-  %sub = sub nuw nsw i8 %0, -8
-  %cmp = icmp ugt i8 %sub, 252
-  %conv4 = zext i1 %cmp to i32
-  ret i32 %conv4
-}
-
-; CHECK-LABEL: safe_add_imm_var
-; CHECK: add.w [[ADD:r[0-9]+]], r0, #129
-; CHECK-NOT: uxt
-; CHECK: cmp [[ADD]], #127
-define i32 @safe_add_imm_var(i8* %b) {
-entry:
-  %0 = load i8, i8* %b, align 1
-  %add = add nuw nsw i8 -127, %0
-  %cmp = icmp ugt i8 %add, 127
-  %conv4 = zext i1 %cmp to i32
-  ret i32 %conv4
-}
-
-; CHECK-LABEL: safe_add_var_imm
-; CHECK: add.w [[SUB:r[0-9]+]], r0, #129
-; CHECK-NOT: uxt
-; CHECK: cmp [[SUB]], #127
-define i32 @safe_add_var_imm(i8* %b) {
-entry:
-  %0 = load i8, i8* %b, align 1
-  %add = add nuw nsw i8 %0, -127
-  %cmp = icmp ugt i8 %add, 127
-  %conv4 = zext i1 %cmp to i32
-  ret i32 %conv4
-}
-
-; CHECK-LABEL: convert_add_order
-; CHECK: orr{{.*}}, #1
-; CHECK: sub{{.*}}, #40
-; CHECK-NOT: uxt
-define i8 @convert_add_order(i8 zeroext %arg) {
-  %mask.0 = and i8 %arg, 1
-  %mask.1 = and i8 %arg, 2
-  %shl = or i8 %arg, 1
-  %add = add nuw i8 %shl, 10
-  %cmp.0 = icmp ult i8 %add, 60
-  %sub = add nsw i8 %shl, -40
-  %cmp.1 = icmp ult i8 %sub, 20
-  %mask.sel = select i1 %cmp.1, i8 %mask.0, i8 %mask.1
-  %res = select i1 %cmp.0, i8 %mask.sel, i8 %arg
-  ret i8 %res
-}
-
-; CHECK-LABEL: underflow_if_sub
-; CHECK: add{{.}} [[ADD:r[0-9]+]], #245
-; CHECK: cmp [[ADD]], r1
-define i8 @underflow_if_sub(i32 %arg, i8 zeroext %arg1) {
-  %cmp = icmp sgt i32 %arg, 0
-  %conv = zext i1 %cmp to i32
-  %and = and i32 %arg, %conv
-  %trunc = trunc i32 %and to i8
-  %conv1 = add nuw nsw i8 %trunc, -11
-  %cmp.1 = icmp ult i8 %conv1, %arg1
-  %res = select i1 %cmp.1, i8 %conv1, i8 100
-  ret i8 %res
-}
-
-; CHECK-LABEL: underflow_if_sub_signext
-; CHECK: cmp r0, #0
-; CHECK-NEXT: uxtb r1, r1
-; CHECK-NOT: xtb
-define i8 @underflow_if_sub_signext(i32 %arg, i8 signext %arg1) {
-  %cmp = icmp sgt i32 %arg, 0
-  %conv = zext i1 %cmp to i32
-  %and = and i32 %arg, %conv
-  %trunc = trunc i32 %and to i8
-  %conv1 = add nuw nsw i8 %trunc, -11
-  %cmp.1 = icmp ugt i8 %arg1, %conv1
-  %res = select i1 %cmp.1, i8 %conv1, i8 100
-  ret i8 %res
-}
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
deleted file mode 100644
index 9b07a80e9a1c1..0000000000000
--- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-phis-ret.ll
+++ /dev/null
@@ -1,218 +0,0 @@
-; RUN: llc -mtriple=thumbv7m -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON
-; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false %s -o - | FileCheck %s --check-prefix=CHECK-COMMON
-; RUN: llc -mtriple=thumbv8m.main -arm-disable-cgp=false -arm-enable-scalar-dsp=true -mcpu=cortex-m33 %s -o - | FileCheck %s --check-prefix=CHECK-COMMON
-; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON
-
-; Test that ARMCodeGenPrepare can handle:
-; - loops
-; - call operands
-; - call return values
-; - ret instructions
-; We use nuw on the arithmetic instructions to avoid complications.
-
-; Check that the arguments are extended but then nothing else is.
-; This also ensures that the pass can handle loops.
-; CHECK-COMMON-LABEL: phi_feeding_phi_args
-; CHECK-COMMON: uxtb
-; CHECK-COMMON: uxtb
-; CHECK-NOT: uxtb
-define void @phi_feeding_phi_args(i8 %a, i8 %b) {
-entry:
-  %0 = icmp ugt i8 %a, %b
-  br i1 %0, label %preheader, label %empty
-
-empty:
-  br label %preheader
-
-preheader:
-  %1 = phi i8 [ %a, %entry ], [ %b, %empty ]
-  br label %loop
-
-loop:
-  %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
-  %cmp = icmp ult i8 %val, 254
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %inc = sub nuw i8 %val, 2
-  br label %if.end
-
-if.else:
-  %inc1 = shl nuw i8 %val, 1
-  br label %if.end
-
-if.end:
-  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
-  %cmp1 = icmp eq i8 %inc2, 255
-  br i1 %cmp1, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-; Same as above, but as the args are zeroext, we shouldn't see any uxts.
-; CHECK-COMMON-LABEL: phi_feeding_phi_zeroext_args
-; CHECK-COMMON-NOT: uxt
-define void @phi_feeding_phi_zeroext_args(i8 zeroext %a, i8 zeroext %b) {
-entry:
-  %0 = icmp ugt i8 %a, %b
-  br i1 %0, label %preheader, label %empty
-
-empty:
-  br label %preheader
-
-preheader:
-  %1 = phi i8 [ %a, %entry ], [ %b, %empty ]
-  br label %loop
-
-loop:
-  %val = phi i8 [ %1, %preheader ], [ %inc2, %if.end ]
-  %cmp = icmp ult i8 %val, 254
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %inc = sub nuw i8 %val, 2
-  br label %if.end
-
-if.else:
-  %inc1 = shl nuw i8 %val, 1
-  br label %if.end
-
-if.end:
-  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
-  %cmp1 = icmp eq i8 %inc2, 255
-  br i1 %cmp1, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-; Just check that phis also work with i16s.
-; CHECK-COMMON-LABEL: phi_i16:
-; CHECK-COMMON-NOT: uxt
-define void @phi_i16() {
-entry:
-  br label %loop
-
-loop:
-  %val = phi i16 [ 0, %entry ], [ %inc2, %if.end ]
-  %cmp = icmp ult i16 %val, 128
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %inc = add nuw i16 %val, 2
-  br label %if.end
-
-if.else:
-  %inc1 = add nuw i16 %val, 1
-  br label %if.end
-
-if.end:
-  %inc2 = phi i16 [ %inc, %if.then], [ %inc1, %if.else ]
-  %cmp1 = icmp ult i16 %inc2, 253
-  br i1 %cmp1, label %loop, label %exit
-
-exit:
-  ret void
-}
-
-; CHECK-COMMON-LABEL: ret_i8
-; CHECK-COMMON-NOT: uxt
-define i8 @ret_i8() {
-entry:
-  br label %loop
-
-loop:
-  %val = phi i8 [ 0, %entry ], [ %inc2, %if.end ]
-  %cmp = icmp ult i8 %val, 128
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %inc = add nuw i8 %val, 2
-  br label %if.end
-
-if.else:
-  %inc1 = add nuw i8 %val, 1
-  br label %if.end
-
-if.end:
-  %inc2 = phi i8 [ %inc, %if.then], [ %inc1, %if.else ]
-  %cmp1 = icmp ult i8 %inc2, 253
-  br i1 %cmp1, label %exit, label %loop
-
-exit:
-  ret i8 %inc2
-}
-
-; CHECK-COMMON-LABEL: phi_multiple_undefs
-; CHECK-COMMON-NOT: uxt
-define i16 @phi_multiple_undefs(i16 zeroext %arg) {
-entry:
-  br label %loop
-
-loop:
-  %val = phi i16 [ undef, %entry ], [ %inc2, %if.end ]
-  %cmp = icmp ult i16 %val, 128
-  br i1 %cmp, label %if.then, label %if.else
-
-if.then:
-  %inc = add nuw i16 %val, 2
-  br label %if.end
-
-if.else:
-  %inc1 = add nuw i16 %val, 1
-  br label %if.end
-
-if.end:
-  %inc2 = phi i16 [ %inc, %if.then], [ %inc1, %if.else ]
-  %unrelated = phi i16 [ undef, %if.then ], [ %arg, %if.else ]
-  %cmp1 = icmp ult i16 %inc2, 253
-  br i1 %cmp1, label %loop, label %exit
-
-exit:
-  ret i16 %unrelated
-}
-
-; CHECK-COMMON-LABEL: promote_arg_return
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON: strb
-define i16 @promote_arg_return(i16 zeroext %arg1, i16 zeroext %arg2, i8* %res) {
-  %add = add nuw i16 %arg1, 15
-  %mul = mul nuw nsw i16 %add, 3
-  %cmp = icmp ult i16 %mul, %arg2
-  %conv = zext i1 %cmp to i8
-  store i8 %conv, i8* %res
-  ret i16 %arg1
-}
-
-; CHECK-COMMON-LABEL: signext_bitcast_phi_select
-; CHECK: uxth [[UXT:r[0-9]+]], r0
-; CHECK: sxth [[SXT:r[0-9]+]], [[UXT]]
-; CHECK: cmp [[SXT]],
-; CHECK-NOT: xth
-define i16 @signext_bitcast_phi_select(i16 signext %start, i16* %in) {
-entry:
-  %const = bitcast i16 -1 to i16
-  br label %for.body
-
-for.body:
-  %idx = phi i16 [ %select, %if.else ], [ %start, %entry ]
-  %cmp.i = icmp sgt i16 %idx, %const
-  br i1 %cmp.i, label %exit, label %if.then
-
-if.then:
-  %idx.next = getelementptr i16, i16* %in, i16 %idx
-  %ld = load i16, i16* %idx.next, align 2
-  %cmp1.i = icmp eq i16 %ld, %idx
-  br i1 %cmp1.i, label %exit, label %if.else
-
-if.else:
-  %lobit = lshr i16 %idx, 15
-  %lobit.not = xor i16 %lobit, 1
-  %select = add nuw i16 %lobit.not, %idx
-  br label %for.body
-
-exit:
-  %res = phi i16 [ %ld, %if.then ], [ 0, %for.body ]
-  ret i16 %res
-}
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-pointers.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-pointers.ll
deleted file mode 100644
index e7f800232d45d..0000000000000
--- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-pointers.ll
+++ /dev/null
@@ -1,135 +0,0 @@
-; RUN: llc -mtriple=thumbv8 -arm-disable-cgp=false %s -o - | FileCheck %s
-; RUN: llc -mtriple=armv8 -arm-disable-cgp=false %s -o - | FileCheck %s
-
-; CHECK-LABEL: phi_pointers
-; CHECK-NOT: uxt
-define void @phi_pointers(i16* %a, i16* %b, i8 zeroext %M, i8 zeroext %N) {
-entry:
-  %add = add nuw i8 %M, 1
-  %and = and i8 %add, 1
-  %cmp = icmp ugt i8 %add, %N
-  %base = select i1 %cmp, i16* %a, i16* %b
-  %other = select i1 %cmp, i16* %b, i16* %b
-  br label %loop
-
-loop:
-  %ptr = phi i16* [ %base, %entry ], [ %gep, %loop ]
-  %idx = phi i8 [ %and, %entry ], [ %inc, %loop ]
-  %load = load i16, i16* %ptr, align 2
-  %inc = add nuw nsw i8 %idx, 1
-  %gep = getelementptr inbounds i16, i16* %ptr, i8 %inc
-  %cond = icmp eq i16* %gep, %other
-  br i1 %cond, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-; CHECK-LABEL: phi_pointers_null
-; CHECK-NOT: uxt
-define void @phi_pointers_null(i16* %a, i16* %b, i8 zeroext %M, i8 zeroext %N) {
-entry:
-  %add = add nuw i8 %M, 1
-  %and = and i8 %add, 1
-  %cmp = icmp ugt i8 %add, %N
-  %base = select i1 %cmp, i16* %a, i16* %b
-  %other = select i1 %cmp, i16* %b, i16* %b
-  %cmp.1 = icmp eq i16* %base, %other
-  br i1 %cmp.1, label %fail, label %loop
-
-fail:
-  br label %loop
-
-loop:
-  %ptr = phi i16* [ %base, %entry ], [ null, %fail ], [ %gep, %if.then ]
-  %idx = phi i8 [ %and, %entry ], [ 0, %fail ], [ %inc, %if.then ]
-  %undef = icmp eq i16* %ptr, undef
-  br i1 %undef, label %exit, label %if.then
-
-if.then:
-  %load = load i16, i16* %ptr, align 2
-  %inc = add nuw nsw i8 %idx, 1
-  %gep = getelementptr inbounds i16, i16* %ptr, i8 %inc
-  %cond = icmp eq i16* %gep, %other
-  br i1 %cond, label %exit, label %loop
-
-exit:
-  ret void
-}
-
-declare i8 @do_something_with_ptr(i8, i16*)
-
-; CHECK-LABEL: call_pointer
-; CHECK-NOT: uxt
-define i8 @call_pointer(i8 zeroext %x, i8 zeroext %y, i16* %a, i16* %b) {
-  %or = or i8 %x, %y
-  %shr = lshr i8 %or, 1
-  %add = add nuw i8 %shr, 2
-  %cmp = icmp ne i8 %add, 0
-  %ptr = select i1 %cmp, i16* %a, i16* %b
-  %call = tail call zeroext i8 @do_something_with_ptr(i8 %shr, i16* %ptr)
-  ret i8 %call
-}
-
-; CHECK-LABEL: pointer_to_pointer
-; CHECK-NOT: uxt
-define i16 @pointer_to_pointer(i16** %arg, i16 zeroext %limit) {
-entry:
-  %addr = load i16*, i16** %arg
-  %val = load i16, i16* %addr
-  %add = add nuw i16
%val, 7 - %cmp = icmp ult i16 %add, 256 - %res = select i1 %cmp, i16 128, i16 255 - ret i16 %res -} - -; CHECK-LABEL: gep_2d_array -; CHECK-NOT: uxt -define i8 @gep_2d_array(i8** %a, i8 zeroext %arg) { -entry: - %arrayidx.us = getelementptr inbounds i8*, i8** %a, i32 0 - %0 = load i8*, i8** %arrayidx.us, align 4 - %1 = load i8, i8* %0, align 1 - %sub = sub nuw i8 %1, 1 - %cmp = icmp ult i8 %sub, %arg - %res = select i1 %cmp, i8 27, i8 54 - ret i8 %res -} - -; CHECK-LABEL: gep_2d_array_loop -; CHECK-NOT: uxt -define void @gep_2d_array_loop(i16** nocapture readonly %a, i16** nocapture readonly %b, i32 %N) { -entry: - %cmp30 = icmp eq i32 %N, 0 - br i1 %cmp30, label %for.cond.cleanup, label %for.cond1.preheader.us - -for.cond1.preheader.us: - %y.031.us = phi i32 [ %inc13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ] - br label %for.body4.us - -for.body4.us: - %x.029.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ] - %arrayidx.us = getelementptr inbounds i16*, i16** %a, i32 %x.029.us - %0 = load i16*, i16** %arrayidx.us, align 4 - %arrayidx5.us = getelementptr inbounds i16, i16* %0, i32 %y.031.us - %1 = load i16, i16* %arrayidx5.us, align 2 - %dec.us = add nuw i16 %1, -1 - %cmp6.us = icmp ult i16 %dec.us, 16383 - %shl.us = shl nuw i16 %dec.us, 2 - %spec.select.us = select i1 %cmp6.us, i16 %shl.us, i16 %dec.us - %arrayidx10.us = getelementptr inbounds i16*, i16** %b, i32 %x.029.us - %2 = load i16*, i16** %arrayidx10.us, align 4 - %arrayidx11.us = getelementptr inbounds i16, i16* %2, i32 %y.031.us - store i16 %spec.select.us, i16* %arrayidx11.us, align 2 - %inc.us = add nuw i32 %x.029.us, 1 - %exitcond = icmp eq i32 %inc.us, %N - br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us - -for.cond1.for.cond.cleanup3_crit_edge.us: - %inc13.us = add nuw i32 %y.031.us, 1 - %exitcond32 = icmp eq i32 %inc13.us, %N - br i1 %exitcond32, label %for.cond.cleanup, label %for.cond1.preheader.us - -for.cond.cleanup: - ret void -} diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll deleted file mode 100644 index 15030bd38660d..0000000000000 --- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed-icmps.ll +++ /dev/null @@ -1,108 +0,0 @@ -; RUN: llc -mtriple=thumbv8m.main -mcpu=cortex-m33 -arm-disable-cgp=false -mattr=-use-misched %s -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-NODSP -; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP -; RUN: llc -mtriple=thumbv8 %s -arm-disable-cgp=false -arm-enable-scalar-dsp=true -arm-enable-scalar-dsp-imms=true -o - | FileCheck %s --check-prefix=CHECK-COMMON --check-prefix=CHECK-DSP-IMM - -; CHECK-COMMON-LABEL: eq_sgt -; CHECK-NODSP: add -; CHECK-NODSP: uxtb -; CHECK-NODSP: sxtb -; CHECK-NODSP: cmp -; CHECK-NODSP: sub -; CHECK-NODSP: sxtb -; CHECK-NODSP: cmp - -; CHECK-DSP: uadd8 -; CHECK-DSP: sub -; CHECK-DSP: cmp -; CHECK-DSP: sxtb -; CHECK-DSP: sxtb -; CHECK-DSP: cmp - -; CHECK-DSP-IMM: uadd8 [[ADD:r[0-9]+]], -; CHECK-DSP-IMM: cmp [[ADD]], -; CHECK-DSP-IMM: subs [[SUB:r[0-9]+]], -; CHECK-DSP-IMM: sxtb [[SEXT0:r[0-9]+]], [[ADD]] -; CHECK-DSP-IMM: sxtb [[SEXT1:r[0-9]+]], [[SUB]] -; CHECK-DSP-IMM: cmp [[SEXT1]], [[SEXT0]] -define i8 @eq_sgt(i8* %x, i8 *%y, i8 zeroext %z) { -entry: - %load0 = load i8, i8* %x, align 1 - %load1 = load i8, i8* %y, align 1 - %add = add i8 %load0, %z - %sub = sub i8 %load1, 1 - %cmp = icmp eq i8 
%add, 200
-  %cmp1 = icmp sgt i8 %sub, %add
-  %res0 = select i1 %cmp, i8 35, i8 47
-  %res1 = select i1 %cmp1, i8 %res0, i8 %sub
-  ret i8 %res1
-}
-
-; CHECK-COMMON-LABEL: ugt_slt
-; CHECK-NODSP: sub
-; CHECK-NODSP: sxth
-; CHECK-NODSP: uxth
-; CHECK-NODSP: add
-; CHECK-NODSP: sxth
-; CHECK-NODSP: cmp
-; CHECK-NODSP: cmp
-
-; CHECK-DSP: sub
-; CHECK-DSP: sxth
-; CHECK-DSP: add
-; CHECK-DSP: uxth
-; CHECK-DSP: sxth
-; CHECK-DSP: cmp
-; CHECK-DSP: cmp
-
-; CHECK-DSP-IMM: uadd16 [[ADD:r[0-9]+]],
-; CHECK-DSP-IMM: sxth.w [[SEXT:r[0-9]+]], [[ADD]]
-; CHECK-DSP-IMM: sxth [[ARG:r[0-9]+]], r2
-; CHECK-DSP-IMM: cmp [[SEXT]], [[ARG]]
-; CHECK-DSP-IMM-NOT: uxt
-; CHECK-DSP-IMM: movs [[ONE:r[0-9]+]], #1
-; CHECK-DSP-IMM: usub16 [[SUB:r[0-9]+]], r1, [[ONE]]
-; CHECK-DSP-IMM: cmp [[SUB]], r2
-define i16 @ugt_slt(i16 *%x, i16 zeroext %y, i16 zeroext %z) {
-entry:
-  %load0 = load i16, i16* %x, align 1
-  %add = add i16 %load0, %z
-  %sub = sub i16 %y, 1
-  %cmp = icmp slt i16 %add, %z
-  %cmp1 = icmp ugt i16 %sub, %z
-  %res0 = select i1 %cmp, i16 35, i16 -1
-  %res1 = select i1 %cmp1, i16 %res0, i16 0
-  ret i16 %res1
-}
-
-; CHECK-COMMON-LABEL: urem_trunc_icmps
-; CHECK-COMMON-NOT: uxt
-; CHECK-COMMON: sxtb [[SEXT:r[0-9]+]],
-; CHECK-COMMON: cmp [[SEXT]], #7
-define void @urem_trunc_icmps(i16** %in, i32* %g, i32* %k) {
-entry:
-  %ptr = load i16*, i16** %in, align 4
-  %ld = load i16, i16* %ptr, align 2
-  %cmp.i = icmp eq i16 %ld, 0
-  br i1 %cmp.i, label %exit, label %cond.false.i
-
-cond.false.i:
-  %rem = urem i16 5, %ld
-  %extract.t = trunc i16 %rem to i8
-  br label %body
-
-body:
-  %cond.in.i.off0 = phi i8 [ %extract.t, %cond.false.i ], [ %add, %for.inc ]
-  %cmp = icmp sgt i8 %cond.in.i.off0, 7
-  %conv5 = zext i1 %cmp to i32
-  store i32 %conv5, i32* %g, align 4
-  %.pr = load i32, i32* %k, align 4
-  %tobool13150 = icmp eq i32 %.pr, 0
-  br i1 %tobool13150, label %for.inc, label %exit
-
-for.inc:
-  %add = add nuw i8 %cond.in.i.off0, 1
-  br label %body
-
-exit:
-  ret void
-}
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed.ll
deleted file mode 100644
index 596893724d203..0000000000000
--- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-signed.ll
+++ /dev/null
@@ -1,89 +0,0 @@
-; RUN: llc -mtriple=thumbv7em -arm-disable-cgp=false %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv8m.main -mattr=+dsp -arm-disable-cgp=false %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumbv7 %s -arm-disable-cgp=false -o - | FileCheck %s
-; RUN: llc -mtriple=armv8 %s -arm-disable-cgp=false -o - | FileCheck %s
-
-; Test to check that ARMCodeGenPrepare doesn't optimise away sign extends.
-; CHECK-LABEL: test_signed_load:
-; CHECK: uxth
-define i16 @test_signed_load(i16* %ptr) {
-  %load = load i16, i16* %ptr
-  %conv0 = zext i16 %load to i32
-  %conv1 = sext i16 %load to i32
-  %cmp = icmp eq i32 %conv0, %conv1
-  %conv2 = zext i1 %cmp to i16
-  ret i16 %conv2
-}
-
-; Don't allow sign bit generating opcodes. 
-; CHECK-LABEL: test_ashr: -; CHECK: sxth -define i16 @test_ashr(i16 zeroext %arg) { - %ashr = ashr i16 %arg, 1 - %cmp = icmp eq i16 %ashr, 0 - %conv = zext i1 %cmp to i16 - ret i16 %conv -} - -; CHECK-LABEL: test_sdiv: -; CHECK: sxth -define i16 @test_sdiv(i16 zeroext %arg) { - %sdiv = sdiv i16 %arg, 2 - %cmp = icmp ne i16 %sdiv, 0 - %conv = zext i1 %cmp to i16 - ret i16 %conv -} - -; CHECK-LABEL: test_srem -; CHECK: sxth -define i16 @test_srem(i16 zeroext %arg) { - %srem = srem i16 %arg, 4 - %cmp = icmp ne i16 %srem, 0 - %conv = zext i1 %cmp to i16 - ret i16 %conv -} - -; CHECK-LABEL: test_signext_b -; CHECK: ldrb [[LDR:r[0-9]+]], [r0] -; CHECK: uxtab [[UXT:r[0-9]+]], [[LDR]], r1 -; CHECK: cm{{.*}} [[UXT]], #128 -define i32 @test_signext_b(i8* %ptr, i8 signext %arg) { -entry: - %0 = load i8, i8* %ptr, align 1 - %1 = add nuw nsw i8 %0, %arg - %cmp = icmp ult i8 %1, 128 - %res = select i1 %cmp, i32 42, i32 20894 - ret i32 %res -} - -; CHECK-LABEL: test_signext_b_ult_slt -; CHECK: ldrb [[LDR:r[0-9]+]], [r0] -; CHECK: uxtab [[ADD:r[0-9]+]], [[LDR]], r1 -; CHECK: uxtb [[UXT:r[0-9]+]], r1 -; CHECK: cmp [[ADD]], [[UXT]] -; CHECK: uxtb [[TRUNC:r[0-9]+]], [[ADD]] -; CHECK: cmp [[TRUNC]], #127 -define i32 @test_signext_b_ult_slt(i8* %ptr, i8 signext %arg) { -entry: - %0 = load i8, i8* %ptr, align 1 - %1 = add nuw nsw i8 %0, %arg - %cmp = icmp sle i8 %1, 126 - %cmp.1 = icmp ule i8 %1, %arg - %or = and i1 %cmp, %cmp.1 - %res = select i1 %or, i32 42, i32 57 - ret i32 %res -} - -; CHECK-LABEL: test_signext_h -; CHECK: ldrh [[LDR:r[0-9]+]], [r0] -; CHECK: uxtah [[ADD:r[0-9]+]], [[LDR]], r1 -; CHECK: cm{{.*}} [[ADD]], -define i32 @test_signext_h(i16* %ptr, i16 signext %arg) { -entry: - %0 = load i16, i16* %ptr, align 1 - %1 = add nuw nsw i16 %0, %arg - %cmp = icmp ult i16 %1, 32768 - %res = select i1 %cmp, i32 42, i32 20894 - ret i32 %res -} - diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-switch.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-switch.ll deleted file mode 100644 index 29c35fbc96e00..0000000000000 --- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-switch.ll +++ /dev/null @@ -1,168 +0,0 @@ -; RUN: llc -mtriple=thumbv7em %s -arm-disable-cgp=false -o - | FileCheck %s -; RUN: llc -mtriple=thumbv7-linux-android %s -arm-disable-cgp=false -o - | FileCheck %s - -; CHECK-LABEL: truncate_source_phi_switch -; CHECK: ldrb -; CHECK: uxtb -define void @truncate_source_phi_switch(i8* %memblock, i8* %store, i16 %arg) { -entry: - %pre = load i8, i8* %memblock, align 1 - %conv = trunc i16 %arg to i8 - br label %header - -header: - %phi.0 = phi i8 [ %pre, %entry ], [ %count, %latch ] - %phi.1 = phi i8 [ %conv, %entry ], [ %phi.3, %latch ] - %phi.2 = phi i8 [ 0, %entry], [ %count, %latch ] - switch i8 %phi.0, label %default [ - i8 43, label %for.inc.i - i8 45, label %for.inc.i.i - ] - -for.inc.i: - %xor = xor i8 %phi.1, 1 - br label %latch - -for.inc.i.i: - %and = and i8 %phi.1, 3 - br label %latch - -default: - %sub = sub i8 %phi.0, 1 - %cmp2 = icmp ugt i8 %sub, 4 - br i1 %cmp2, label %latch, label %exit - -latch: - %phi.3 = phi i8 [ %xor, %for.inc.i ], [ %and, %for.inc.i.i ], [ %phi.2, %default ] - %count = add nuw i8 %phi.2, 1 - store i8 %count, i8* %store, align 1 - br label %header - -exit: - ret void -} - -; CHECK-LABEL: icmp_switch_source: -; CHECK-NOT: uxt -define i16 @icmp_switch_source(i16 zeroext %arg) { -entry: - %conv = add nuw i16 %arg, 15 - %mul = mul nuw nsw i16 %conv, 3 - switch i16 %arg, label %default [ - i16 0, label %sw.bb - i16 1, label %sw.bb.i - ] - -sw.bb: - %cmp0 = icmp ult i16 %mul, 127 - %select = 
select i1 %cmp0, i16 %mul, i16 127 - br label %exit - -sw.bb.i: - %cmp1 = icmp ugt i16 %mul, 34 - %select.i = select i1 %cmp1, i16 %mul, i16 34 - br label %exit - -default: - br label %exit - -exit: - %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] - ret i16 %res -} - -; CHECK-LABEL: icmp_switch_narrow_source: -; CHECK-NOT: uxt -define i16 @icmp_switch_narrow_source(i8 zeroext %arg) { -entry: - %conv = zext i8 %arg to i16 - %add = add nuw i16 %conv, 15 - %mul = mul nuw nsw i16 %add, 3 - switch i8 %arg, label %default [ - i8 0, label %sw.bb - i8 1, label %sw.bb.i - ] - -sw.bb: - %cmp0 = icmp ult i16 %mul, 127 - %select = select i1 %cmp0, i16 %mul, i16 127 - br label %exit - -sw.bb.i: - %cmp1 = icmp ugt i16 %mul, 34 - %select.i = select i1 %cmp1, i16 %mul, i16 34 - br label %exit - -default: - br label %exit - -exit: - %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] - ret i16 %res -} - -; CHECK-LABEL: icmp_switch_trunc: -; CHECK-NOT: uxt -define i16 @icmp_switch_trunc(i16 zeroext %arg) { -entry: - %conv = add nuw i16 %arg, 15 - %mul = mul nuw nsw i16 %conv, 3 - %trunc = trunc i16 %arg to i3 - switch i3 %trunc, label %default [ - i3 0, label %sw.bb - i3 1, label %sw.bb.i - ] - -sw.bb: - %cmp0 = icmp ult i16 %mul, 127 - %select = select i1 %cmp0, i16 %mul, i16 127 - br label %exit - -sw.bb.i: - %cmp1 = icmp ugt i16 %mul, 34 - %select.i = select i1 %cmp1, i16 %mul, i16 34 - br label %exit - -default: - br label %exit - -exit: - %res = phi i16 [ %select, %sw.bb ], [ %select.i, %sw.bb.i ], [ %mul, %default ] - ret i16 %res -} - -%class.ae = type { i8 } -%class.x = type { i8 } -%class.v = type { %class.q } -%class.q = type { i16 } -declare %class.x* @_ZNK2ae2afEv(%class.ae*) local_unnamed_addr -declare %class.v* @_ZN1x2acEv(%class.x*) local_unnamed_addr - -; CHECK-LABEL: trunc_i16_i9_switch -; CHECK-NOT: uxt -define i32 @trunc_i16_i9_switch(%class.ae* %this) { -entry: - %call = tail call %class.x* @_ZNK2ae2afEv(%class.ae* %this) - %call2 = tail call %class.v* @_ZN1x2acEv(%class.x* %call) - %0 = getelementptr inbounds %class.v, %class.v* %call2, i32 0, i32 0, i32 0 - %1 = load i16, i16* %0, align 2 - %2 = trunc i16 %1 to i9 - %trunc = and i9 %2, -64 - switch i9 %trunc, label %cleanup.fold.split [ - i9 0, label %cleanup - i9 -256, label %if.then7 - ] - -if.then7: - %3 = and i16 %1, 7 - %tobool = icmp eq i16 %3, 0 - %cond = select i1 %tobool, i32 2, i32 1 - br label %cleanup - -cleanup.fold.split: - br label %cleanup - -cleanup: - %retval.0 = phi i32 [ %cond, %if.then7 ], [ 0, %entry ], [ 2, %cleanup.fold.split ] - ret i32 %retval.0 -} diff --git a/llvm/test/CodeGen/ARM/CGP/clear-structures.ll b/llvm/test/CodeGen/ARM/CGP/clear-structures.ll deleted file mode 100644 index 86459c35dd60d..0000000000000 --- a/llvm/test/CodeGen/ARM/CGP/clear-structures.ll +++ /dev/null @@ -1,75 +0,0 @@ -; RUN: opt -arm-codegenprepare -arm-disable-cgp=false -mtriple=armv8 -verify %s -S -o - | FileCheck %s - -; CHECK: clear_structures -define i32 @clear_structures(i8* nocapture readonly %fmt, [1 x i32] %ap.coerce, i8* %out, void (i32, i8*)* nocapture %write) { -entry: - br label %while.cond.outer - -while.cond.outer: - %fmt.addr.0.ph = phi i8* [ %fmt, %entry ], [ %fmt.addr.3, %while.cond.outer.backedge ] - %0 = load i8, i8* %fmt.addr.0.ph, align 1 - br label %while.cond - -while.cond: - switch i8 %0, label %while.cond [ - i8 0, label %while.end48 - i8 37, label %while.cond2 - ] - -while.cond2: - %flags.0 = phi i32 [ %or, %while.cond2 ], [ 0, %while.cond ] - 
%fmt.addr.0.pn = phi i8* [ %fmt.addr.1, %while.cond2 ], [ %fmt.addr.0.ph, %while.cond ] - %fmt.addr.1 = getelementptr inbounds i8, i8* %fmt.addr.0.pn, i32 1 - %1 = load i8, i8* %fmt.addr.1, align 1 - ; CHECK: add i8 [[LOAD:%[^ ]+]], -32 - %sub = add i8 %1, -32 - %conv6 = zext i8 %sub to i32 - %shl = shl i32 1, %conv6 - %and = and i32 %shl, 75785 - %tobool7 = icmp eq i32 %and, 0 - %or = or i32 %shl, %flags.0 - br i1 %tobool7, label %while.cond10.preheader, label %while.cond2 - -while.cond10.preheader: - ; CHECK: [[ADD:%[^ ]+]] = add i8 [[LOAD]], -48 - ; CHECK: icmp ult i8 [[ADD]], 10 - %.off = add i8 %1, -48 - %2 = icmp ult i8 %.off, 10 - br i1 %2, label %while.cond10, label %while.end18.split - -while.cond10: - br label %while.cond10 - -while.end18.split: - %cmp20 = icmp eq i8 %1, 46 - br i1 %cmp20, label %if.then22, label %cond.end - -if.then22: - %incdec.ptr23 = getelementptr inbounds i8, i8* %fmt.addr.0.pn, i32 2 - %.pr74 = load i8, i8* %incdec.ptr23, align 1 - ; CHECK: [[LOAD2:[^ ]+]] = load i8, i8* - ; CHECK: [[ZEXT:[^ ]+]] = zext i8 [[LOAD2]] to i32 - ; CHECK: sub i32 [[ZEXT]], 48 - %.pr74.off = add i8 %.pr74, -48 - %3 = icmp ult i8 %.pr74.off, 10 - br i1 %3, label %while.cond24, label %cond.end - -while.cond24: - br label %while.cond24 - -cond.end: - %fmt.addr.3 = phi i8* [ %fmt.addr.1, %while.end18.split ], [ %incdec.ptr23, %if.then22 ] - %and39 = and i32 %flags.0, 2048 - %tobool40 = icmp eq i32 %and39, 0 - br i1 %tobool40, label %while.cond.outer.backedge, label %if.then43 - -while.cond.outer.backedge: - br label %while.cond.outer - -if.then43: - tail call void %write(i32 43, i8* %out) #1 - br label %while.cond.outer.backedge - -while.end48: - ret i32 undef -} diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir index de5545594bf39..9d66209211058 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-consts.mir @@ -44,7 +44,7 @@ body: | %3(s1) = G_CONSTANT i1 1 G_STORE %3(s1), %4(p0) :: (store 1) ; CHECK-NOT: G_CONSTANT i1 - ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK: [[EXT:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; CHECK: {{%[0-9]+}}:_(s1) = G_TRUNC [[EXT]](s32) ; CHECK-NOT: G_CONSTANT i1 diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir index b413130558e63..0cdab2c41f798 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir @@ -1131,10 +1131,9 @@ body: | ; SOFT-NOT: G_FCMP ; For soft float we just need to return a '-1' constant, but the truncation ; to 1 bit is converted by the combiner to the following masking sequence. - ; SOFT: [[R:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; SOFT: [[MASK:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; SOFT: [[R:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; SOFT: [[RCOPY:%[0-9]+]]:_(s32) = COPY [[R]](s32) - ; SOFT: [[REXT:%[0-9]+]]:_(s32) = G_AND [[RCOPY]], [[MASK]] + ; SOFT: [[REXT:%[0-9]+]]:_(s32) = G_AND [[RCOPY]], [[R]] ; SOFT-NOT: G_FCMP ; CHECK: $r0 = COPY [[REXT]] ... @@ -1853,11 +1852,10 @@ body: | ; HARD: [[R:%[0-9]+]]:_(s1) = G_FCMP floatpred(true), [[X]](s64), [[Y]] ; HARD: [[REXT:%[0-9]+]]:_(s32) = G_ZEXT [[R]](s1) ; SOFT-NOT: G_FCMP - ; SOFT: [[R:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; The result needs to be truncated, and the combiner turns the truncation ; into the following masking sequence. 
; SOFT: [[MASK:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; SOFT: [[RCOPY:%[0-9]+]]:_(s32) = COPY [[R]] + ; SOFT: [[RCOPY:%[0-9]+]]:_(s32) = COPY [[MASK]] ; SOFT: [[REXT:%[0-9]+]]:_(s32) = G_AND [[RCOPY]], [[MASK]] ; SOFT-NOT: G_FCMP %7(s32) = G_ZEXT %6(s1) diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll index dd741388d7499..3fd35bd1e9d0a 100644 --- a/llvm/test/CodeGen/ARM/O3-pipeline.ll +++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -40,7 +40,7 @@ ; CHECK-NEXT: Function Alias Analysis Results ; CHECK-NEXT: Transform functions to use DSP intrinsics ; CHECK-NEXT: Interleaved Access Pass -; CHECK-NEXT: ARM IR optimizations +; CHECK-NEXT: Type Promotion ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: CodeGen Prepare @@ -154,6 +154,7 @@ ; CHECK-NEXT: ARM constant island placement and branch shortening pass ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: ReachingDefAnalysis ; CHECK-NEXT: ARM Low Overhead Loops pass ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: StackMap Liveness Analysis diff --git a/llvm/test/CodeGen/ARM/addsubo-legalization.ll b/llvm/test/CodeGen/ARM/addsubo-legalization.ll index e9143d814d3de..e3a48ed0c14f1 100644 --- a/llvm/test/CodeGen/ARM/addsubo-legalization.ll +++ b/llvm/test/CodeGen/ARM/addsubo-legalization.ll @@ -95,48 +95,19 @@ define <2 x i1> @usubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { ; CHECK-LABEL: saddo: ; CHECK: @ %bb.0: -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: vadd.i64 q8, q10, q9 -; CHECK-NEXT: vmov.32 r2, d20[0] -; CHECK-NEXT: vmov.32 r1, d20[1] -; CHECK-NEXT: vmov.32 r12, d16[0] -; CHECK-NEXT: vmov.32 r8, d16[1] -; CHECK-NEXT: vmov.32 lr, d17[0] -; CHECK-NEXT: vmov.32 r4, d21[0] -; CHECK-NEXT: vmov.32 r5, d17[1] -; CHECK-NEXT: vmov.32 r6, d18[1] -; CHECK-NEXT: vmov.32 r7, d21[1] -; CHECK-NEXT: subs.w r2, r12, r2 -; CHECK-NEXT: vmov.32 r2, d19[1] -; CHECK-NEXT: sbcs.w r1, r8, r1 -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: subs.w r4, lr, r4 -; CHECK-NEXT: sbcs.w r7, r5, r7 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: asrs r7, r6, #31 -; CHECK-NEXT: vdup.32 d21, r3 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vdup.32 d20, r1 +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vqadd.s64 q10, q9, q8 +; CHECK-NEXT: vadd.i64 q8, q9, q8 +; CHECK-NEXT: vceq.i32 q9, q8, q10 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0] -; CHECK-NEXT: asrs r2, r2, #31 -; CHECK-NEXT: vdup.32 d19, r2 -; CHECK-NEXT: vdup.32 d18, r7 -; CHECK-NEXT: veor q9, q9, q10 +; CHECK-NEXT: vrev64.32 q10, q9 +; CHECK-NEXT: vand q9, q9, q10 +; CHECK-NEXT: vmvn q9, q9 ; CHECK-NEXT: vmovn.i64 d18, q9 ; CHECK-NEXT: vmov r2, r1, d18 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: bx lr %x = load <2 x i64>, <2 x i64>* %ptr, align 8 %y = load <2 x i64>, <2 x i64>* %ptr2, align 8 %s = call {<2 x i64>, <2 x i1>} @llvm.sadd.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y) @@ -149,64 +120,19 @@ define <2 x i1> @saddo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { define <2 x i1> @ssubo(<2 x i64> *%ptr, <2 x i64> *%ptr2) { 
; CHECK-LABEL: ssubo: ; CHECK: @ %bb.0: -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: vld1.64 {d18, d19}, [r1] -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vld1.64 {d20, d21}, [r0] -; CHECK-NEXT: vsub.i64 q8, q10, q9 -; CHECK-NEXT: vmov.32 r1, d20[0] -; CHECK-NEXT: vmov.32 r12, d20[1] -; CHECK-NEXT: vmov.32 r3, d16[0] -; CHECK-NEXT: vmov.32 lr, d16[1] -; CHECK-NEXT: vmov.32 r4, d21[0] -; CHECK-NEXT: vmov.32 r5, d17[0] -; CHECK-NEXT: vmov.32 r6, d21[1] -; CHECK-NEXT: vmov.32 r7, d17[1] -; CHECK-NEXT: vmov.32 r8, d18[1] -; CHECK-NEXT: subs r1, r3, r1 -; CHECK-NEXT: vmov.32 r3, d18[0] -; CHECK-NEXT: sbcs.w r1, lr, r12 -; CHECK-NEXT: vmov.32 r12, d19[0] -; CHECK-NEXT: mov.w r1, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r1, #1 -; CHECK-NEXT: subs r5, r5, r4 -; CHECK-NEXT: vmov.32 r5, d19[1] -; CHECK-NEXT: sbcs r7, r6 -; CHECK-NEXT: mov.w r7, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r7, #-1 -; CHECK-NEXT: vdup.32 d21, r7 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: sbcs.w r3, r2, r8 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: rsbs.w r6, r12, #0 -; CHECK-NEXT: sbcs.w r6, r2, r5 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r2, #-1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: vdup.32 d19, r2 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r3, #-1 -; CHECK-NEXT: cmp r1, #0 -; CHECK-NEXT: it ne -; CHECK-NEXT: movne.w r1, #-1 -; CHECK-NEXT: vdup.32 d18, r3 -; CHECK-NEXT: vdup.32 d20, r1 -; CHECK-NEXT: veor q9, q9, q10 +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vqsub.s64 q10, q9, q8 +; CHECK-NEXT: vsub.i64 q8, q9, q8 +; CHECK-NEXT: vceq.i32 q9, q8, q10 ; CHECK-NEXT: vst1.64 {d16, d17}, [r0] +; CHECK-NEXT: vrev64.32 q10, q9 +; CHECK-NEXT: vand q9, q9, q10 +; CHECK-NEXT: vmvn q9, q9 ; CHECK-NEXT: vmovn.i64 d18, q9 ; CHECK-NEXT: vmov r2, r1, d18 ; CHECK-NEXT: mov r0, r2 -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: bx lr %x = load <2 x i64>, <2 x i64>* %ptr, align 8 %y = load <2 x i64>, <2 x i64>* %ptr2, align 8 %s = call {<2 x i64>, <2 x i1>} @llvm.ssub.with.overflow.v2i64(<2 x i64> %x, <2 x i64> %y) diff --git a/llvm/test/CodeGen/ARM/cmov_fp16.ll b/llvm/test/CodeGen/ARM/cmov_fp16.ll new file mode 100644 index 0000000000000..925fed5828112 --- /dev/null +++ b/llvm/test/CodeGen/ARM/cmov_fp16.ll @@ -0,0 +1,261 @@ +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+fullfp16 %s -o - | FileCheck %s --check-prefixes CHECK-THUMB,CHECK +; RUN: llc -mtriple=armv8.2a-arm-none-eabi -mattr=+fullfp16 %s -o - | FileCheck %s --check-prefixes CHECK-ARM,CHECK + +define i32 @test_ne(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_ne: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-NEXT: vseleq.f16 s0, s0, s2 +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp ne i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_eq(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_eq: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: 
vcvt.f16.u32 s2, s2 +; CHECK-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-NEXT: vseleq.f16 s0, s0, s2 +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp eq i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_gt(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_gt: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-NEXT: vselgt.f16 s0, s0, s2 +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp sgt i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_ge(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_ge: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-NEXT: vselge.f16 s0, s0, s2 +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp sge i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_lt(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_lt: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-NEXT: vselge.f16 s0, s0, s2 +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp slt i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_le(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-LABEL: test_le: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-NEXT: vselgt.f16 s0, s0, s2 +; CHECK-NEXT: vmov.f16 r0, s0 +; CHECK-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp sle i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_hi(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-THUMB-LABEL: test_hi: +; CHECK-THUMB: @ %bb.0: @ %entry +; CHECK-THUMB-NEXT: vmov s2, r0 +; CHECK-THUMB-NEXT: cmp r2, r3 +; CHECK-THUMB-NEXT: vmov s0, r1 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-THUMB-NEXT: it hi +; CHECK-THUMB-NEXT: vmovhi.f32 s0, s2 +; CHECK-THUMB-NEXT: vmov.f16 r0, s0 +; CHECK-THUMB-NEXT: bx lr +; +; CHECK-ARM-LABEL: test_hi: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: vmov s2, r0 +; CHECK-ARM-NEXT: cmp r2, r3 +; CHECK-ARM-NEXT: vmov s0, r1 +; CHECK-ARM-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-ARM-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-ARM-NEXT: vmovhi.f32 s0, s2 +; CHECK-ARM-NEXT: vmov.f16 r0, s0 +; CHECK-ARM-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = 
icmp ugt i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_hs(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-THUMB-LABEL: test_hs: +; CHECK-THUMB: @ %bb.0: @ %entry +; CHECK-THUMB-NEXT: vmov s2, r0 +; CHECK-THUMB-NEXT: cmp r2, r3 +; CHECK-THUMB-NEXT: vmov s0, r1 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-THUMB-NEXT: it hs +; CHECK-THUMB-NEXT: vmovhs.f32 s0, s2 +; CHECK-THUMB-NEXT: vmov.f16 r0, s0 +; CHECK-THUMB-NEXT: bx lr +; +; CHECK-ARM-LABEL: test_hs: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: vmov s2, r0 +; CHECK-ARM-NEXT: cmp r2, r3 +; CHECK-ARM-NEXT: vmov s0, r1 +; CHECK-ARM-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-ARM-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-ARM-NEXT: vmovhs.f32 s0, s2 +; CHECK-ARM-NEXT: vmov.f16 r0, s0 +; CHECK-ARM-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp uge i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_lo(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-THUMB-LABEL: test_lo: +; CHECK-THUMB: @ %bb.0: @ %entry +; CHECK-THUMB-NEXT: vmov s2, r0 +; CHECK-THUMB-NEXT: cmp r2, r3 +; CHECK-THUMB-NEXT: vmov s0, r1 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-THUMB-NEXT: it lo +; CHECK-THUMB-NEXT: vmovlo.f32 s0, s2 +; CHECK-THUMB-NEXT: vmov.f16 r0, s0 +; CHECK-THUMB-NEXT: bx lr +; +; CHECK-ARM-LABEL: test_lo: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: vmov s2, r0 +; CHECK-ARM-NEXT: cmp r2, r3 +; CHECK-ARM-NEXT: vmov s0, r1 +; CHECK-ARM-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-ARM-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-ARM-NEXT: vmovlo.f32 s0, s2 +; CHECK-ARM-NEXT: vmov.f16 r0, s0 +; CHECK-ARM-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp ult i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + +define i32 @test_ls(i32 %x, i32 %y, i32 %a, i32 %b) { +; CHECK-THUMB-LABEL: test_ls: +; CHECK-THUMB: @ %bb.0: @ %entry +; CHECK-THUMB-NEXT: vmov s2, r0 +; CHECK-THUMB-NEXT: cmp r2, r3 +; CHECK-THUMB-NEXT: vmov s0, r1 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-THUMB-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-THUMB-NEXT: it ls +; CHECK-THUMB-NEXT: vmovls.f32 s0, s2 +; CHECK-THUMB-NEXT: vmov.f16 r0, s0 +; CHECK-THUMB-NEXT: bx lr +; +; CHECK-ARM-LABEL: test_ls: +; CHECK-ARM: @ %bb.0: @ %entry +; CHECK-ARM-NEXT: vmov s2, r0 +; CHECK-ARM-NEXT: cmp r2, r3 +; CHECK-ARM-NEXT: vmov s0, r1 +; CHECK-ARM-NEXT: vcvt.f16.u32 s2, s2 +; CHECK-ARM-NEXT: vcvt.f16.u32 s0, s0 +; CHECK-ARM-NEXT: vmovls.f32 s0, s2 +; CHECK-ARM-NEXT: vmov.f16 r0, s0 +; CHECK-ARM-NEXT: bx lr +entry: + %x.half = uitofp i32 %x to half + %y.half = uitofp i32 %y to half + %cmp = icmp ule i32 %a, %b + %cond = select i1 %cmp, half %x.half, half %y.half + %0 = bitcast half %cond to i16 + %1 = zext i16 %0 to i32 + ret i32 %1 +} + diff --git a/llvm/test/CodeGen/ARM/fp-intrinsics.ll b/llvm/test/CodeGen/ARM/fp-intrinsics.ll new file mode 100644 index 0000000000000..8d4a6376a9771 --- /dev/null +++ b/llvm/test/CodeGen/ARM/fp-intrinsics.ll @@ -0,0 +1,557 @@ +; RUN: llc -mtriple=armv8a-none-eabi %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SP,CHECK-DP +; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - | FileCheck %s 
--check-prefixes=CHECK,CHECK-NOSP,CHECK-NODP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - -mattr=fp-armv8 | FileCheck %s --check-prefixes=CHECK,CHECK-SP,CHECK-DP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - -mattr=fp-armv8sp | FileCheck %s --check-prefixes=CHECK,CHECK-SP,CHECK-NODP
+
+; Check that constrained fp intrinsics are correctly lowered. In particular,
+; check that the valid combinations of single-precision and double-precision
+; hardware being present or absent work as expected (i.e. we get an instruction
+; when one is available, otherwise a libcall).
+
+; FIXME: Tests fail as various things in CodeGen and Target/ARM need fixing.
+; XFAIL: *
+
+
+; Single-precision intrinsics
+
+; CHECK-LABEL: add_f32:
+; CHECK-NOSP: bl __aeabi_fadd
+; CHECK-SP: vadd.f32
+define float @add_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: sub_f32:
+; CHECK-NOSP: bl __aeabi_fsub
+; CHECK-SP: vsub.f32
+define float @sub_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.fsub.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: mul_f32:
+; CHECK-NOSP: bl __aeabi_fmul
+; CHECK-SP: vmul.f32
+define float @mul_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.fmul.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: div_f32:
+; CHECK-NOSP: bl __aeabi_fdiv
+; CHECK-SP: vdiv.f32
+define float @div_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.fdiv.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: frem_f32:
+; CHECK: bl fmodf
+define float @frem_f32(float %x, float %y) #0 {
+  %val = call float @llvm.experimental.constrained.frem.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: fma_f32:
+; CHECK-NOSP: bl fmaf
+; CHECK-SP: vfma.f32
+define float @fma_f32(float %x, float %y, float %z) #0 {
+  %val = call float @llvm.experimental.constrained.fma.f32(float %x, float %y, float %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: fptosi_f32:
+; CHECK-NOSP: bl __aeabi_f2iz
+; CHECK-SP: vcvt.s32.f32
+define i32 @fptosi_f32(float %x) #0 {
+  %val = call i32 @llvm.experimental.constrained.fptosi.f32(float %x, metadata !"fpexcept.strict") #0
+  ret i32 %val
+}
+
+; CHECK-LABEL: fptoui_f32:
+; CHECK-NOSP: bl __aeabi_f2uiz
+; CHECK-SP: vcvt.u32.f32
+define i32 @fptoui_f32(float %x) #0 {
+  %val = call i32 @llvm.experimental.constrained.fptoui.f32(float %x, metadata !"fpexcept.strict") #0
+  ret i32 %val
+}
+
+; CHECK-LABEL: sqrt_f32:
+; CHECK-NOSP: bl sqrtf
+; CHECK-SP: vsqrt.f32
+define float @sqrt_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.sqrt.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: powi_f32:
+; CHECK: bl __powisf2
+define float @powi_f32(float %x, i32 %y) #0 {
+  %val = call float @llvm.experimental.constrained.powi.f32(float %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
+; CHECK-LABEL: sin_f32:
+; CHECK: bl sinf
+define float @sin_f32(float %x) #0 {
+  %val = call float 
@llvm.experimental.constrained.sin.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: cos_f32: +; CHECK: bl cosf +define float @cos_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.cos.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: pow_f32: +; CHECK: bl powf +define float @pow_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.pow.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: log_f32: +; CHECK: bl logf +define float @log_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.log.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: log10_f32: +; CHECK: bl log10f +define float @log10_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.log10.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: log2_f32: +; CHECK: bl log2f +define float @log2_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.log2.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: exp_f32: +; CHECK: bl expf +define float @exp_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.exp.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: exp2_f32: +; CHECK: bl exp2f +define float @exp2_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.exp2.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: rint_f32: +; CHECK-NOSP: bl rintf +; CHECK-SP: vrintx.f32 +define float @rint_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.rint.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: nearbyint_f32: +; CHECK-NOSP: bl nearbyintf +; CHECK-SP: vrintr.f32 +define float @nearbyint_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.nearbyint.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: lrint_f32: +; CHECK: bl lrintf +define i32 @lrint_f32(float %x) #0 { + %val = call i32 @llvm.experimental.constrained.lrint.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: llrint_f32: +; CHECK: bl llrintf +define i32 @llrint_f32(float %x) #0 { + %val = call i32 @llvm.experimental.constrained.llrint.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: maxnum_f32: +; CHECK-NOSP: bl fmaxf +; CHECK-SP: vmaxnm.f32 +define float @maxnum_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.maxnum.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: minnum_f32: +; CHECK-NOSP: bl fminf +; CHECK-SP: vminnm.f32 +define float @minnum_f32(float %x, float %y) #0 { + %val = call float @llvm.experimental.constrained.minnum.f32(float %x, float %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: ceil_f32: +; CHECK-NOSP: bl ceilf +; CHECK-SP: vrintp.f32 +define float @ceil_f32(float %x) #0 { + %val = call float 
@llvm.experimental.constrained.ceil.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: floor_f32: +; CHECK-NOSP: bl floorf +; CHECK-SP: vrintm.f32 +define float @floor_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.floor.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: lround_f32: +; CHECK: bl lroundf +define i32 @lround_f32(float %x) #0 { + %val = call i32 @llvm.experimental.constrained.lround.f32(float %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: llround_f32: +; CHECK: bl llroundf +define i32 @llround_f32(float %x) #0 { + %val = call i32 @llvm.experimental.constrained.llround.f32(float %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: round_f32: +; CHECK-NOSP: bl roundf +; CHECK-SP: vrinta.f32 +define float @round_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.round.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: trunc_f32: +; CHECK-NOSP: bl truncf +; CHECK-SP: vrintz.f32 +define float @trunc_f32(float %x) #0 { + %val = call float @llvm.experimental.constrained.trunc.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + + +; Double-precision intrinsics + +; CHECK-LABEL: add_f64: +; CHECK-NODP: bl __aeabi_dadd +; CHECK-DP: vadd.f64 +define double @add_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.fadd.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: sub_f64: +; CHECK-NODP: bl __aeabi_dsub +; CHECK-DP: vsub.f64 +define double @sub_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.fsub.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: mul_f64: +; CHECK-NODP: bl __aeabi_dmul +; CHECK-DP: vmul.f64 +define double @mul_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.fmul.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: div_f64: +; CHECK-NODP: bl __aeabi_ddiv +; CHECK-DP: vdiv.f64 +define double @div_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.fdiv.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: frem_f64: +; CHECK: bl fmod +define double @frem_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.frem.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: fma_f64: +; CHECK-NODP: bl fma +; CHECK-DP: vfma.f64 +define double @fma_f64(double %x, double %y, double %z) #0 { + %val = call double @llvm.experimental.constrained.fma.f64(double %x, double %y, double %z, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: fptosi_f64: +; CHECK-NODP: bl __aeabi_d2iz +; CHECK-DP: vcvt.s32.f64 +define i32 @fptosi_f64(double %x) #0 { + %val = call i32 @llvm.experimental.constrained.fptosi.f64(double %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: fptoui_f64: +; CHECK-NODP: bl __aeabi_d2uiz +; CHECK-DP: vcvt.u32.f64 +define i32 @fptoui_f64(double %x) #0 { + %val = call i32 
@llvm.experimental.constrained.fptoui.f64(double %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: sqrt_f64: +; CHECK-NODP: bl sqrt +; CHECK-DP: vsqrt.f64 +define double @sqrt_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.sqrt.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: powi_f64: +; CHECK: bl __powidf2 +define double @powi_f64(double %x, i32 %y) #0 { + %val = call double @llvm.experimental.constrained.powi.f64(double %x, i32 %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: sin_f64: +; CHECK: bl sin +define double @sin_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.sin.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: cos_f64: +; CHECK: bl cos +define double @cos_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.cos.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: pow_f64: +; CHECK: bl pow +define double @pow_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.pow.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: log_f64: +; CHECK: bl log +define double @log_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.log.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: log10_f64: +; CHECK: bl log10 +define double @log10_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.log10.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: log2_f64: +; CHECK: bl log2 +define double @log2_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.log2.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: exp_f64: +; CHECK: bl exp +define double @exp_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.exp.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: exp2_f64: +; CHECK: bl exp2 +define double @exp2_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.exp2.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: rint_f64: +; CHECK-NODP: bl rint +; CHECK-DP: vrintx.f64 +define double @rint_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.rint.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: nearbyint_f64: +; CHECK-NODP: bl nearbyint +; CHECK-DP: vrintr.f64 +define double @nearbyint_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.nearbyint.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: lrint_f64: +; CHECK: bl lrint +define i32 @lrint_f64(double %x) #0 { + %val = call i32 @llvm.experimental.constrained.lrint.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: llrint_f64: +; CHECK: bl llrint +define i32 @llrint_f64(double %x) #0 { + %val = call i32 @llvm.experimental.constrained.llrint.f64(double %x, metadata !"round.tonearest", 
metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: maxnum_f64: +; CHECK-NODP: bl fmax +; CHECK-DP: vmaxnm.f64 +define double @maxnum_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.maxnum.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: minnum_f64: +; CHECK-NODP: bl fmin +; CHECK-DP: vminnm.f64 +define double @minnum_f64(double %x, double %y) #0 { + %val = call double @llvm.experimental.constrained.minnum.f64(double %x, double %y, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: ceil_f64: +; CHECK-NODP: bl ceil +; CHECK-DP: vrintp.f64 +define double @ceil_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.ceil.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: floor_f64: +; CHECK-NODP: bl floor +; CHECK-DP: vrintm.f64 +define double @floor_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.floor.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: lround_f64: +; CHECK: bl lround +define i32 @lround_f64(double %x) #0 { + %val = call i32 @llvm.experimental.constrained.lround.f64(double %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: llround_f64: +; CHECK: bl llround +define i32 @llround_f64(double %x) #0 { + %val = call i32 @llvm.experimental.constrained.llround.f64(double %x, metadata !"fpexcept.strict") #0 + ret i32 %val +} + +; CHECK-LABEL: round_f64: +; CHECK-NODP: bl round +; CHECK-DP: vrinta.f64 +define double @round_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.round.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + +; CHECK-LABEL: trunc_f64: +; CHECK-NODP: bl trunc +; CHECK-DP: vrintz.f64 +define double @trunc_f64(double %x) #0 { + %val = call double @llvm.experimental.constrained.trunc.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret double %val +} + + +; Single/Double conversion intrinsics + +; CHECK-LABEL: fptrunc_f32: +; CHECK-NODP: bl __aeabi_d2f +; CHECK-DP: vcvt.f32.f64 +define float @fptrunc_f32(double %x) #0 { + %val = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 + ret float %val +} + +; CHECK-LABEL: fpext_f32: +; CHECK-NODP: bl __aeabi_f2d +; CHECK-DP: vcvt.f64.f32 +define double @fpext_f32(float %x) #0 { + %val = call double @llvm.experimental.constrained.fpext.f64.f32(float %x, metadata !"fpexcept.strict") #0 + ret double %val +} + + +attributes #0 = { strictfp } + +declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fsub.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.frem.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata) +declare i32 @llvm.experimental.constrained.fptosi.f32(float, metadata) +declare i32 @llvm.experimental.constrained.fptoui.f32(float, metadata) +declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) 
+declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, metadata) +declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.log2.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.exp.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.exp2.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.rint.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.nearbyint.f32(float, metadata, metadata) +declare i32 @llvm.experimental.constrained.lrint.f32(float, metadata, metadata) +declare i32 @llvm.experimental.constrained.llrint.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.maxnum.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.minnum.f32(float, float, metadata, metadata) +declare float @llvm.experimental.constrained.ceil.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.floor.f32(float, metadata, metadata) +declare i32 @llvm.experimental.constrained.lround.f32(float, metadata) +declare i32 @llvm.experimental.constrained.llround.f32(float, metadata) +declare float @llvm.experimental.constrained.round.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.trunc.f32(float, metadata, metadata) + +declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.frem.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata) +declare i32 @llvm.experimental.constrained.fptosi.f64(double, metadata) +declare i32 @llvm.experimental.constrained.fptoui.f64(double, metadata) +declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) +declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) +declare i32 @llvm.experimental.constrained.lrint.f64(double, metadata, metadata) 
+declare i32 @llvm.experimental.constrained.llrint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.maxnum.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.minnum.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.ceil.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.floor.f64(double, metadata, metadata) +declare i32 @llvm.experimental.constrained.lround.f64(double, metadata) +declare i32 @llvm.experimental.constrained.llround.f64(double, metadata) +declare double @llvm.experimental.constrained.round.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.trunc.f64(double, metadata, metadata) + +declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) diff --git a/llvm/test/CodeGen/ARM/neon-v8.1a.ll b/llvm/test/CodeGen/ARM/neon-v8.1a.ll index 91259139d4463..95d2085800810 100644 --- a/llvm/test/CodeGen/ARM/neon-v8.1a.ll +++ b/llvm/test/CodeGen/ARM/neon-v8.1a.ll @@ -8,20 +8,20 @@ declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) -declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) -declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) -declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v4i16: %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) - %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <4 x i16> %retval } @@ -29,7 +29,7 @@ define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v8i16: %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) - %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <8 x i16> %retval } @@ -37,7 +37,7 @@ define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> define <2 x i32> 
@test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v2i32: %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) - %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <2 x i32> %retval } @@ -45,7 +45,7 @@ define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlah_v4i32: %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) - %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <4 x i32> %retval } @@ -53,7 +53,7 @@ define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v4i16: %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) - %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <4 x i16> %retval } @@ -61,7 +61,7 @@ define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v8i16: %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) - %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <8 x i16> %retval } @@ -69,7 +69,7 @@ define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v2i32: %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) - %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} ret <2 x i32> %retval } @@ -77,7 +77,7 @@ define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> define <4 x i32> @test_vqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { ; CHECK-LABEL: test_vqrdmlsh_v4i32: %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) - %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} ret <4 x i32> %retval } @@ -90,7 +90,7 @@ define <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) - %retval = call <4 x i16> 
@llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3] ret <4 x i16> %retval } @@ -100,7 +100,7 @@ define <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16 entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) - %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2] ret <8 x i16> %retval } @@ -110,7 +110,7 @@ define <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) - %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1] ret <2 x i32> %retval } @@ -120,7 +120,7 @@ define <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) - %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0] ret <4 x i32> %retval } @@ -130,7 +130,7 @@ define <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) - %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod) + %retval = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %acc, <4 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3] ret <4 x i16> %retval } @@ -140,7 +140,7 @@ define <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16 entry: %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) - %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod) + %retval = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %acc, <8 x i16> %prod) ; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2] ret <8 x i16> %retval } @@ -150,7 +150,7 @@ define <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) - %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod) + %retval = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %acc, <2 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1] ret <2 x i32> %retval } @@ -160,7 +160,7 @@ define <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> entry: %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer 
%prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) - %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) + %retval = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %acc, <4 x i32> %prod) ; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0] ret <4 x i32> %retval } diff --git a/llvm/test/CodeGen/ARM/neon-vcadd.ll b/llvm/test/CodeGen/ARM/neon-vcadd.ll new file mode 100644 index 0000000000000..93a85c8c73c6c --- /dev/null +++ b/llvm/test/CodeGen/ARM/neon-vcadd.ll @@ -0,0 +1,54 @@ +; RUN: llc %s -mtriple=arm -mattr=+armv8.3-a,+fullfp16 -o - | FileCheck %s + +define <4 x half> @foo16x4_rot(<4 x half> %a, <4 x half> %b) { +entry: +; CHECK-LABEL: foo16x4_rot +; CHECK-DAG: vcadd.f16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #90 +; CHECK-DAG: vcadd.f16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #270 + %vcadd_rot90_v2.i = tail call <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16(<4 x half> %a, <4 x half> %b) + %vcadd_rot270_v2.i = tail call <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16(<4 x half> %a, <4 x half> %b) + %add = fadd <4 x half> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i + ret <4 x half> %add +} + +define <2 x float> @foo32x2_rot(<2 x float> %a, <2 x float> %b) { +entry: +; CHECK-LABEL: foo32x2_rot +; CHECK-DAG: vcadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #90 +; CHECK-DAG: vcadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #270 + %vcadd_rot90_v2.i = tail call <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32(<2 x float> %a, <2 x float> %b) + %vcadd_rot270_v2.i = tail call <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32(<2 x float> %a, <2 x float> %b) + %add = fadd <2 x float> %vcadd_rot90_v2.i, %vcadd_rot270_v2.i + ret <2 x float> %add +} + +define <8 x half> @foo16x8_rot(<8 x half> %a, <8 x half> %b) { +entry: +; CHECK-LABEL: foo16x8_rot +; CHECK-DAG: vcadd.f16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #90 +; CHECK-DAG: vcadd.f16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #270 + %vcaddq_rot90_v2.i = tail call <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16(<8 x half> %a, <8 x half> %b) + %vcaddq_rot270_v2.i = tail call <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16(<8 x half> %a, <8 x half> %b) + %add = fadd <8 x half> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i + ret <8 x half> %add +} + +define <4 x float> @foo32x4_rot(<4 x float> %a, <4 x float> %b) { +entry: +; CHECK-LABEL: foo32x4_rot +; CHECK-DAG: vcadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #90 +; CHECK-DAG: vcadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #270 + %vcaddq_rot90_v2.i = tail call <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32(<4 x float> %a, <4 x float> %b) + %vcaddq_rot270_v2.i = tail call <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32(<4 x float> %a, <4 x float> %b) + %add = fadd <4 x float> %vcaddq_rot90_v2.i, %vcaddq_rot270_v2.i + ret <4 x float> %add +} + +declare <4 x half> @llvm.arm.neon.vcadd.rot90.v4f16(<4 x half>, <4 x half>) +declare <4 x half> @llvm.arm.neon.vcadd.rot270.v4f16(<4 x half>, <4 x half>) +declare <2 x float> @llvm.arm.neon.vcadd.rot90.v2f32(<2 x float>, <2 x float>) +declare <2 x float> @llvm.arm.neon.vcadd.rot270.v2f32(<2 x float>, <2 x float>) +declare <8 x half> @llvm.arm.neon.vcadd.rot90.v8f16(<8 x half>, <8 x half>) +declare <8 x half> @llvm.arm.neon.vcadd.rot270.v8f16(<8 x half>, <8 x half>) +declare <4 x float> @llvm.arm.neon.vcadd.rot90.v4f32(<4 x float>, <4 x float>) +declare <4 x float> @llvm.arm.neon.vcadd.rot270.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll 
b/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll new file mode 100644 index 0000000000000..a1323810151a5 --- /dev/null +++ b/llvm/test/CodeGen/ARM/neon-vqaddsub-upgrade.ll @@ -0,0 +1,330 @@ +; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s + +define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqadds8: +;CHECK: vqadd.s8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqadds16: +;CHECK: vqadd.s16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqadds32: +;CHECK: vqadd.s32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqadds64: +;CHECK: vqadd.s64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqaddu8: +;CHECK: vqadd.u8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqaddu16: +;CHECK: vqadd.u16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqaddu32: +;CHECK: vqadd.u32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqaddu64: +;CHECK: vqadd.u64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqaddQs8: +;CHECK: vqadd.s8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqaddQs16: +;CHECK: vqadd.s16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqaddQs32: +;CHECK: vqadd.s32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x 
i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqaddQs64: +;CHECK: vqadd.s64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqaddQu8: +;CHECK: vqadd.u8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqaddQu16: +;CHECK: vqadd.u16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqaddQu32: +;CHECK: vqadd.u32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqaddQu64: +;CHECK: vqadd.u64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + + +define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubs8: +;CHECK: vqsub.s8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubs16: +;CHECK: vqsub.s16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubs32: +;CHECK: vqsub.s32 + %tmp1 = load <2 x i32>, <2 x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubs64: +;CHECK: vqsub.s64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubu8: +;CHECK: vqsub.u8 + %tmp1 = load <8 x i8>, <8 x i8>* %A + %tmp2 = load <8 x i8>, <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubu16: +;CHECK: vqsub.u16 + %tmp1 = load <4 x i16>, <4 x i16>* %A + %tmp2 = load <4 x i16>, <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubu32: +;CHECK: vqsub.u32 + %tmp1 = load <2 x i32>, <2 
x i32>* %A + %tmp2 = load <2 x i32>, <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubu64: +;CHECK: vqsub.u64 + %tmp1 = load <1 x i64>, <1 x i64>* %A + %tmp2 = load <1 x i64>, <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubQs8: +;CHECK: vqsub.s8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubQs16: +;CHECK: vqsub.s16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubQs32: +;CHECK: vqsub.s32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubQs64: +;CHECK: vqsub.s64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: vqsubQu8: +;CHECK: vqsub.u8 + %tmp1 = load <16 x i8>, <16 x i8>* %A + %tmp2 = load <16 x i8>, <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: vqsubQu16: +;CHECK: vqsub.u16 + %tmp1 = load <8 x i16>, <8 x i16>* %A + %tmp2 = load <8 x i16>, <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: vqsubQu32: +;CHECK: vqsub.u32 + %tmp1 = load <4 x i32>, <4 x i32>* %A + %tmp2 = load <4 x i32>, <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK-LABEL: vqsubQu64: +;CHECK: vqsub.u64 + %tmp1 = load <2 x i64>, <2 x i64>* %A + %tmp2 = load <2 x i64>, <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x 
i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/reg-alloc-fixed-r6-vla.ll b/llvm/test/CodeGen/ARM/reg-alloc-fixed-r6-vla.ll deleted file mode 100644 index 0b6fd7443af29..0000000000000 --- a/llvm/test/CodeGen/ARM/reg-alloc-fixed-r6-vla.ll +++ /dev/null @@ -1,44 +0,0 @@ -; Using VLAs(Variable Length Arrays) in a function will use R6 to keep track -; of the stack frame, and also spill/restore R6 to the stack. -; This tests that using -ffixed-r6 (-mattr=+reserve-r6) will stop R6 -; being used and also stop it being spilled/restored to the stack. 
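For context, a rough C equivalent of the deleted test body below, in the style of the "Equivalent C source code" comments carried by the neighboring reg-alloc tests. This is a sketch only and was not part of the original file: the function name f matches the IR; the variable names i and vla are illustrative.

    // Sketch: a VLA whose size is only known at run time. Clang lowers this
    // to llvm.stacksave + a dynamic alloca + llvm.stackrestore, so the frame
    // needs a base register (R6 on ARM) unless R6 is reserved.
    void f(void) {
      int i = 0;
      double vla[i];
      (void)vla;
    }
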
-; RUN: llc < %s -mcpu=cortex-m0 -mtriple=thumbv7-arm-none-eabi | FileCheck %s --check-prefix=CHECK-STATIC --check-prefix=CHECK-R6 -; RUN: llc < %s -mcpu=cortex-m0 -mtriple=thumbv7-arm-none-eabi -mattr=+reserve-r6 | FileCheck %s --check-prefix=CHECK-STATIC --check-prefix=CHECK-NO-R6 - -define void @f() #0 { -entry: - %i = alloca i32, align 4 - store i32 0, i32* %i, align 4 - - %saved_stack = alloca i8*, align 4 - %0 = call i8* @llvm.stacksave() - store i8* %0, i8** %saved_stack, align 4 - - %__vla_expr0 = alloca i32, align 4 - %1 = load i32, i32* %i, align 4 - %vla = alloca double, i32 %1, align 8 - store i32 %1, i32* %__vla_expr0, align 4 - - %2 = load i8*, i8** %saved_stack, align 4 - call void @llvm.stackrestore(i8* %2) - - ret void -} - -declare i8* @llvm.stacksave() #1 -declare void @llvm.stackrestore(i8* %ptr) #1 - -attributes #0 = { noinline nounwind "stackrealign" } -attributes #1 = { nounwind } - -; CHECK-STATIC: push {r4, -; CHECK-R6: r6 -; CHECK-NO-R6-NOT: r6 -; CHECK-STATIC: lr} -; CHECK-R6: r6 -; CHECK-NO-R6-NOT: r6 -; CHECK-STATIC: pop {r4, -; CHECK-R6: r6 -; CHECK-NO-R6-NOT: r6 -; CHECK-STATIC: pc} - diff --git a/llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r6-modified.ll b/llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r6-modified.ll deleted file mode 100644 index e2a4af87dde7e..0000000000000 --- a/llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r6-modified.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -mattr=+reserve-r6 -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s -; -; Equivalent C source code -; register unsigned r6 asm("r6"); -; void bar(unsigned int i, -; unsigned int j, -; unsigned int k, -; unsigned int l, -; unsigned int m, -; unsigned int n, -; unsigned int o, -; unsigned int p) -; { -; r6 = 10; -; unsigned int result = i + j + k + l + m + n + o + p; -; } -declare void @llvm.write_register.i32(metadata, i32) nounwind - -define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind { -entry: -; CHECK-NOT: push {{{.*}}r6,{{.*}}} -; CHECK: {{.*}}mov{{.*}}r6,{{.*}} -; CHECK-NOT: {{.*}}r6{{.*}} - %i.addr = alloca i32, align 4 - %j.addr = alloca i32, align 4 - %k.addr = alloca i32, align 4 - %l.addr = alloca i32, align 4 - %m.addr = alloca i32, align 4 - %n.addr = alloca i32, align 4 - %o.addr = alloca i32, align 4 - %p.addr = alloca i32, align 4 - %result = alloca i32, align 4 - store i32 %i, i32* %i.addr, align 4 - store i32 %j, i32* %j.addr, align 4 - store i32 %k, i32* %k.addr, align 4 - store i32 %l, i32* %l.addr, align 4 - store i32 %m, i32* %m.addr, align 4 - store i32 %n, i32* %n.addr, align 4 - store i32 %o, i32* %o.addr, align 4 - store i32 %p, i32* %p.addr, align 4 - call void @llvm.write_register.i32(metadata !0, i32 10) - %0 = load i32, i32* %i.addr, align 4 - %1 = load i32, i32* %j.addr, align 4 - %add = add i32 %0, %1 - %2 = load i32, i32* %k.addr, align 4 - %add1 = add i32 %add, %2 - %3 = load i32, i32* %l.addr, align 4 - %add2 = add i32 %add1, %3 - %4 = load i32, i32* %m.addr, align 4 - %add3 = add i32 %add2, %4 - %5 = load i32, i32* %n.addr, align 4 - %add4 = add i32 %add3, %5 - %6 = load i32, i32* %o.addr, align 4 - %add5 = add i32 %add4, %6 - %7 = load i32, i32* %p.addr, align 4 - %add6 = add i32 %add5, %7 - store i32 %add6, i32* %result, align 4 - ret void -} - -!llvm.named.register.r6 = !{!0} -!0 = !{!"r6"} - diff --git a/llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r6.ll b/llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r6.ll deleted file mode 100644 index 
3647c0701a7c3..0000000000000 --- a/llvm/test/CodeGen/ARM/reg-alloc-with-fixed-reg-r6.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -mattr=+reserve-r6 -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s -; -; Equivalent C source code -; void bar(unsigned int i, -; unsigned int j, -; unsigned int k, -; unsigned int l, -; unsigned int m, -; unsigned int n, -; unsigned int o, -; unsigned int p) -; { -; unsigned int result = i + j + k + l + m + n + o + p; -; } - -define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind { -entry: -; CHECK-NOT: push {{{.*}}r6,{{.*}}} - %i.addr = alloca i32, align 4 - %j.addr = alloca i32, align 4 - %k.addr = alloca i32, align 4 - %l.addr = alloca i32, align 4 - %m.addr = alloca i32, align 4 - %n.addr = alloca i32, align 4 - %o.addr = alloca i32, align 4 - %p.addr = alloca i32, align 4 - %result = alloca i32, align 4 - store i32 %i, i32* %i.addr, align 4 - store i32 %j, i32* %j.addr, align 4 - store i32 %k, i32* %k.addr, align 4 - store i32 %l, i32* %l.addr, align 4 - store i32 %m, i32* %m.addr, align 4 - store i32 %n, i32* %n.addr, align 4 - store i32 %o, i32* %o.addr, align 4 - store i32 %p, i32* %p.addr, align 4 - %0 = load i32, i32* %i.addr, align 4 - %1 = load i32, i32* %j.addr, align 4 - %add = add i32 %0, %1 - %2 = load i32, i32* %k.addr, align 4 - %add1 = add i32 %add, %2 - %3 = load i32, i32* %l.addr, align 4 - %add2 = add i32 %add1, %3 - %4 = load i32, i32* %m.addr, align 4 - %add3 = add i32 %add2, %4 - %5 = load i32, i32* %n.addr, align 4 - %add4 = add i32 %add3, %5 - %6 = load i32, i32* %o.addr, align 4 - %add5 = add i32 %add4, %6 - %7 = load i32, i32* %p.addr, align 4 - %add6 = add i32 %add5, %7 - store i32 %add6, i32* %result, align 4 -; CHECK: {{.*}}r5{{.*}} -; CHECK-NOT: {{.*}}r6{{.*}} - ret void -; CHECK-NOT: pop {{{.*}}r6,{{.*}}} -} - diff --git a/llvm/test/CodeGen/ARM/reg-alloc-wout-fixed-regs.ll b/llvm/test/CodeGen/ARM/reg-alloc-wout-fixed-regs.ll deleted file mode 100644 index d1f020936a3d6..0000000000000 --- a/llvm/test/CodeGen/ARM/reg-alloc-wout-fixed-regs.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: llc < %s -mtriple=arm-linux-gnueabi -O0 -filetype=asm --regalloc=fast 2>&1 | FileCheck %s -; -; Equivalent C source code -; void bar(unsigned int i, -; unsigned int j, -; unsigned int k, -; unsigned int l, -; unsigned int m, -; unsigned int n, -; unsigned int o, -; unsigned int p) -; { -; unsigned int result = i + j + k + l + m + n + o + p; -; } - -define void @bar(i32 %i, i32 %j, i32 %k, i32 %l, i32 %m, i32 %n, i32 %o, i32 %p) nounwind { -entry: -; CHECK: push {{{.*}}r4, r5{{.*}}} - %i.addr = alloca i32, align 4 - %j.addr = alloca i32, align 4 - %k.addr = alloca i32, align 4 - %l.addr = alloca i32, align 4 - %m.addr = alloca i32, align 4 - %n.addr = alloca i32, align 4 - %o.addr = alloca i32, align 4 - %p.addr = alloca i32, align 4 - %result = alloca i32, align 4 - store i32 %i, i32* %i.addr, align 4 - store i32 %j, i32* %j.addr, align 4 - store i32 %k, i32* %k.addr, align 4 - store i32 %l, i32* %l.addr, align 4 - store i32 %m, i32* %m.addr, align 4 - store i32 %n, i32* %n.addr, align 4 - store i32 %o, i32* %o.addr, align 4 - store i32 %p, i32* %p.addr, align 4 - %0 = load i32, i32* %i.addr, align 4 - %1 = load i32, i32* %j.addr, align 4 - %add = add i32 %0, %1 - %2 = load i32, i32* %k.addr, align 4 - %add1 = add i32 %add, %2 - %3 = load i32, i32* %l.addr, align 4 - %add2 = add i32 %add1, %3 - %4 = load i32, i32* %m.addr, align 4 - %add3 = add i32 %add2, %4 - %5 = load i32, i32* 
%n.addr, align 4 - %add4 = add i32 %add3, %5 - %6 = load i32, i32* %o.addr, align 4 - %add5 = add i32 %add4, %6 - %7 = load i32, i32* %p.addr, align 4 - %add6 = add i32 %add5, %7 - store i32 %add6, i32* %result, align 4 -; CHECK: {{.*}}r4{{.*}} -; CHECK: {{.*}}r5{{.*}} - -; CHECK: pop {{{.*}}r4, r5{{.*}}} - ret void -} - diff --git a/llvm/test/CodeGen/ARM/vmul.ll b/llvm/test/CodeGen/ARM/vmul.ll index fcffe175e2bac..e8cf8d9b27b6f 100644 --- a/llvm/test/CodeGen/ARM/vmul.ll +++ b/llvm/test/CodeGen/ARM/vmul.ll @@ -574,7 +574,7 @@ for.body33: ; preds = %for.body33, %for.bo %vmovl.i225 = zext <8 x i8> undef to <8 x i16> %mul.i223 = mul <8 x i16> %vmovl.i249, %vmovl.i249 %vshl_n = shl <8 x i16> %mul.i223, - %vqsub2.i216 = tail call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> , <8 x i16> %vshl_n) nounwind + %vqsub2.i216 = tail call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> , <8 x i16> %vshl_n) nounwind %mul.i209 = mul <8 x i16> undef, %vshr_n130 = lshr <8 x i16> undef, %vshr_n134 = lshr <8 x i16> %mul.i209, @@ -608,7 +608,7 @@ for.end179: ; preds = %for.cond.loopexit, } declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone ; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8), diff --git a/llvm/test/CodeGen/ARM/vqadd.ll b/llvm/test/CodeGen/ARM/vqadd.ll index d1e90cb209449..47432c7b732d6 100644 --- a/llvm/test/CodeGen/ARM/vqadd.ll +++ b/llvm/test/CodeGen/ARM/vqadd.ll @@ -5,7 +5,7 @@ define <8 x i8> @vqadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vqadd.s8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @vqadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqadd.s16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @vqadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqadd.s32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -32,7 +32,7 @@ define <1 x i64> @vqadds64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqadd.s64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -41,7 +41,7 @@ define <8 x i8> @vqaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vqadd.u8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -50,7 +50,7 @@ define <4 x i16> @vqaddu16(<4 x i16>* %A, <4 x 
i16>* %B) nounwind { ;CHECK: vqadd.u16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -59,7 +59,7 @@ define <2 x i32> @vqaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqadd.u32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -68,7 +68,7 @@ define <1 x i64> @vqaddu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqadd.u64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -77,7 +77,7 @@ define <16 x i8> @vqaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqadd.s8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -86,7 +86,7 @@ define <8 x i16> @vqaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqadd.s16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -95,7 +95,7 @@ define <4 x i32> @vqaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqadd.s32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -104,7 +104,7 @@ define <2 x i64> @vqaddQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqadd.s64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -113,7 +113,7 @@ define <16 x i8> @vqaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqadd.u8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -122,7 +122,7 @@ define <8 x i16> @vqaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqadd.u16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -131,7 +131,7 @@ define <4 x i32> @vqaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqadd.u32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x 
i32> @llvm.uadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -140,26 +140,26 @@ define <2 x i64> @vqaddQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqadd.u64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.sadd.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.uadd.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/vqdmul.ll b/llvm/test/CodeGen/ARM/vqdmul.ll index 6da080012a1e6..fa938d45becfb 100644 --- a/llvm/test/CodeGen/ARM/vqdmul.ll +++ b/llvm/test/CodeGen/ARM/vqdmul.ll @@ -204,7 +204,7 @@ define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i16>, <4 x i16>* %C %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, 
<4 x i16> %tmp3) - %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) + %tmp5 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -215,7 +215,7 @@ define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i32>, <2 x i32>* %C %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) - %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) + %tmp5 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -225,7 +225,7 @@ entry: ; CHECK: vqdmlal.s16 q0, d2, d3[1] %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1] %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) - %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) + %2 = tail call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) ret <4 x i32> %2 } @@ -235,12 +235,12 @@ entry: ; CHECK: vqdmlal.s32 q0, d2, d3[1] %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1] %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) - %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) + %2 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) ret <2 x i64> %2 } -declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ;CHECK-LABEL: vqdmlsls16_natural: @@ -249,7 +249,7 @@ define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C %tmp2 = load <4 x i16>, <4 x i16>* %B %tmp3 = load <4 x i16>, <4 x i16>* %C %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) - %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) + %tmp5 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) ret <4 x i32> %tmp5 } @@ -260,7 +260,7 @@ define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C %tmp2 = load <2 x i32>, <2 x i32>* %B %tmp3 = load <2 x i32>, <2 x i32>* %C %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) - %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) + %tmp5 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) ret <2 x i64> %tmp5 } @@ -270,7 +270,7 @@ entry: ; CHECK: vqdmlsl.s16 q0, d2, d3[1] %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> ; <<4 x i16>> [#uses=1] %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) - %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) + %2 = tail call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) ret <4 x i32> %2 } @@ -280,9 +280,9 @@ entry: ; CHECK: vqdmlsl.s32 q0, d2, d3[1] %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> ; <<2 x i32>> [#uses=1] %1 = 
tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) - %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) + %2 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) ret <2 x i64> %2 } -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/ARM/vqsub.ll b/llvm/test/CodeGen/ARM/vqsub.ll index 40963ce824864..9864f6421cb3d 100644 --- a/llvm/test/CodeGen/ARM/vqsub.ll +++ b/llvm/test/CodeGen/ARM/vqsub.ll @@ -5,7 +5,7 @@ define <8 x i8> @vqsubs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vqsub.s8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -14,7 +14,7 @@ define <4 x i16> @vqsubs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqsub.s16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -23,7 +23,7 @@ define <2 x i32> @vqsubs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqsub.s32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -32,7 +32,7 @@ define <1 x i64> @vqsubs64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqsub.s64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -41,7 +41,7 @@ define <8 x i8> @vqsubu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vqsub.u8 %tmp1 = load <8 x i8>, <8 x i8>* %A %tmp2 = load <8 x i8>, <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + %tmp3 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) ret <8 x i8> %tmp3 } @@ -50,7 +50,7 @@ define <4 x i16> @vqsubu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { ;CHECK: vqsub.u16 %tmp1 = load <4 x i16>, <4 x i16>* %A %tmp2 = load <4 x i16>, <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + %tmp3 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) ret <4 x i16> %tmp3 } @@ -59,7 +59,7 @@ define <2 x i32> @vqsubu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { ;CHECK: vqsub.u32 %tmp1 = load <2 x i32>, <2 x i32>* %A %tmp2 = load <2 x i32>, <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + %tmp3 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) ret <2 x i32> %tmp3 } @@ -68,7 +68,7 @@ define <1 x i64> @vqsubu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { ;CHECK: vqsub.u64 %tmp1 = load <1 x i64>, <1 x i64>* %A %tmp2 = load <1 x i64>, <1 x i64>* %B - %tmp3 = call <1 
x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + %tmp3 = call <1 x i64> @llvm.usub.sat.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) ret <1 x i64> %tmp3 } @@ -77,7 +77,7 @@ define <16 x i8> @vqsubQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqsub.s8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -86,7 +86,7 @@ define <8 x i16> @vqsubQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqsub.s16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -95,7 +95,7 @@ define <4 x i32> @vqsubQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqsub.s32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -104,7 +104,7 @@ define <2 x i64> @vqsubQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqsub.s64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } @@ -113,7 +113,7 @@ define <16 x i8> @vqsubQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK: vqsub.u8 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + %tmp3 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) ret <16 x i8> %tmp3 } @@ -122,7 +122,7 @@ define <8 x i16> @vqsubQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK: vqsub.u16 %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + %tmp3 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) ret <8 x i16> %tmp3 } @@ -131,7 +131,7 @@ define <4 x i32> @vqsubQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK: vqsub.u32 %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + %tmp3 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) ret <4 x i32> %tmp3 } @@ -140,26 +140,26 @@ define <2 x i64> @vqsubQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { ;CHECK: vqsub.u64 %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <2 x i64>, <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + %tmp3 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) ret <2 x i64> %tmp3 } -declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8>, <8 x i8>) nounwind 
readnone +declare <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.ssub.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone +declare <8 x i8> @llvm.usub.sat.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.usub.sat.v1i64(<1 x i64>, <1 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone -declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone +declare <16 x i8> @llvm.usub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll new file mode 100644 index 0000000000000..8851c502b6f0b --- /dev/null +++ b/llvm/test/CodeGen/BPF/CORE/no-elf-ama-symbol.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=bpfel -filetype=obj -o - %s | llvm-readelf -s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=obj -o - %s | llvm-readelf -s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfel -filetype=obj -addrsig -o - %s | llvm-readelf -s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=obj -addrsig -o - %s | llvm-readelf -s | FileCheck -check-prefixes=CHECK %s +; +; Source Code: +; struct tt { int a; } __attribute__((preserve_access_index)); +; int test(struct tt *arg) { +; return arg->a; +; } +; Compilation flag: +; clang -target bpf -O2 -g -S -emit-llvm t.c + +%struct.tt = type { i32 } + +; Function Attrs: nounwind readonly +define dso_local i32 @test(%struct.tt* readonly %arg) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata %struct.tt* %arg, metadata !16, metadata !DIExpression()), !dbg !17 + %0 = tail call i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.tts(%struct.tt* %arg, i32 0, i32 0), !dbg !18, !llvm.preserve.access.index !12 + %1 = 
load i32, i32* %0, align 4, !dbg !18, !tbaa !19 + ret i32 %1, !dbg !24 +} + +; CHECK-NOT: llvm.tt:0:0$0:0 + +; Function Attrs: nounwind readnone +declare i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.tts(%struct.tt*, i32, i32) #1 + +; Function Attrs: nounwind readnone speculatable +declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + +attributes #0 = { nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0 (https://github.com/llvm/llvm-project.git 947f9692440836dcb8d88b74b69dd379d85974ce)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/work/tests/bug") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 947f9692440836dcb8d88b74b69dd379d85974ce)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15) +!8 = !DISubroutineType(types: !9) +!9 = !{!10, !11} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64) +!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "tt", file: !1, line: 1, size: 32, elements: !13) +!13 = !{!14} +!14 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !12, file: !1, line: 1, baseType: !10, size: 32) +!15 = !{!16} +!16 = !DILocalVariable(name: "arg", arg: 1, scope: !7, file: !1, line: 2, type: !11) +!17 = !DILocation(line: 0, scope: !7) +!18 = !DILocation(line: 3, column: 15, scope: !7) +!19 = !{!20, !21, i64 0} +!20 = !{!"tt", !21, i64 0} +!21 = !{!"int", !22, i64 0} +!22 = !{!"omnipotent char", !23, i64 0} +!23 = !{!"Simple C/C++ TBAA"} +!24 = !DILocation(line: 3, column: 3, scope: !7) diff --git a/llvm/test/CodeGen/MIR/X86/mir-canon-hash-bb.mir b/llvm/test/CodeGen/MIR/X86/mir-canon-hash-bb.mir new file mode 100644 index 0000000000000..94c69f1be36a6 --- /dev/null +++ b/llvm/test/CodeGen/MIR/X86/mir-canon-hash-bb.mir @@ -0,0 +1,61 @@ +# RUN: llc -run-pass mir-namer -x mir -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -run-pass mir-canonicalizer -x mir -verify-machineinstrs %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + define i32 @_Z1fi(i32 %arg) { + %tmp = alloca i32, align 4 + %tmp1 = alloca i32, align 4 + ret i32 %arg + } + +... 
+--- +name: _Z1fi +registers: + - { id: 0, class: _, preferred-register: '' } + - { id: 1, class: _, preferred-register: '' } + - { id: 2, class: _, preferred-register: '' } + - { id: 3, class: _, preferred-register: '' } + - { id: 4, class: _, preferred-register: '' } + - { id: 5, class: _, preferred-register: '' } + - { id: 6, class: _, preferred-register: '' } + - { id: 7, class: _, preferred-register: '' } + - { id: 8, class: _, preferred-register: '' } +stack: + - { id: 0, name: tmp, type: default, offset: 0, size: 4, alignment: 4 } + - { id: 1, name: tmp1, type: default, offset: 0, size: 4, alignment: 4 } +body: | + bb.0: + %tmp0:_(s32) = COPY $edi + %tmp1:_(s32) = G_CONSTANT i32 0 + %tmp5:_(p0) = G_FRAME_INDEX %stack.0.tmp + %tmp6:_(p0) = G_FRAME_INDEX %stack.1.tmp1 + G_STORE %tmp0(s32), %tmp5(p0) :: (store 4 into %ir.tmp) + %tmp7:_(s32) = G_LOAD %tmp5(p0) :: (load 4 from %ir.tmp) + %tmp8:_(s1) = G_ICMP intpred(ne), %tmp7(s32), %tmp1 + G_BRCOND %tmp8(s1), %bb.1 + G_BR %bb.2 + + ; CHECK: bb.1: + ; CHECK: %bb2_{{[0-9]+}}__1:_(s32) = G_CONSTANT + bb.1: + %tmp4:_(s32) = G_CONSTANT i32 1 + G_STORE %tmp4(s32), %tmp6(p0) :: (store 4 into %ir.tmp1) + G_BR %bb.3 + + + ; CHECK: bb.2: + ; CHECK: %bb1_{{[0-9]+}}__1:_(s32) = G_CONSTANT + bb.2: + %tmp3:_(s32) = G_CONSTANT i32 2 + G_STORE %tmp3(s32), %tmp6(p0) :: (store 4 into %ir.tmp1) + + ; CHECK: bb.3: + ; CHECK: %bb3_{{[0-9]+}}__1:_(s32) = G_LOAD + bb.3: + %tmp9:_(s32) = G_LOAD %tmp6(p0) :: (load 4 from %ir.tmp1) + $eax = COPY %tmp9(s32) + RET 0, implicit $eax + +... diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir index 5a805af18ded3..e30870b73635d 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/constants.mir @@ -131,10 +131,9 @@ tracksRegLiveness: true body: | bb.1.entry: ; MIPS32-LABEL: name: i1_true - ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; MIPS32: [[COPY:%[0-9]+]]:_(s32) = COPY [[C]](s32) - ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]] + ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] ; MIPS32: $v0 = COPY [[AND]](s32) ; MIPS32: RetRA implicit $v0 %0:_(s1) = G_CONSTANT i1 true diff --git a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/select.mir b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/select.mir index 59d4280e1ba88..057abae4d8198 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/legalizer/select.mir +++ b/llvm/test/CodeGen/Mips/GlobalISel/legalizer/select.mir @@ -150,14 +150,13 @@ body: | ; MIPS32: [[COPY1:%[0-9]+]]:_(s32) = COPY $a1 ; MIPS32: [[COPY2:%[0-9]+]]:_(s32) = COPY $a2 ; MIPS32: [[COPY3:%[0-9]+]]:_(s32) = COPY $a3 - ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; MIPS32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; MIPS32: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(slt), [[COPY]](s32), [[COPY1]] ; MIPS32: [[COPY4:%[0-9]+]]:_(s32) = COPY [[ICMP]](s32) ; MIPS32: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C]](s32) ; MIPS32: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[COPY4]], [[COPY5]] - ; MIPS32: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; MIPS32: [[COPY6:%[0-9]+]]:_(s32) = COPY [[XOR]](s32) - ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; MIPS32: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C]] ; MIPS32: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[AND]](s32), [[COPY2]], [[COPY3]] ; MIPS32: $v0 = COPY [[SELECT]](s32) ; MIPS32: RetRA implicit $v0 diff 
--git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll index bdafe26491a34..0187b72780d7f 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/constants.ll @@ -71,7 +71,7 @@ entry: define zeroext i1 @i1_true() { ; MIPS32-LABEL: i1_true: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: addiu $1, $zero, 65535 +; MIPS32-NEXT: ori $1, $zero, 1 ; MIPS32-NEXT: andi $2, $1, 1 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fcmp.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fcmp.ll index 8f559633c9569..58d5c8a160a6b 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fcmp.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/fcmp.ll @@ -15,7 +15,7 @@ entry: define i1 @true_s(float %x, float %y) { ; MIPS32-LABEL: true_s: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: addiu $2, $zero, 65535 +; MIPS32-NEXT: ori $2, $zero, 1 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop entry: @@ -233,7 +233,7 @@ entry: define i1 @true_d(double %x, double %y) { ; MIPS32-LABEL: true_d: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: addiu $2, $zero, 65535 +; MIPS32-NEXT: ori $2, $zero, 1 ; MIPS32-NEXT: jr $ra ; MIPS32-NEXT: nop entry: diff --git a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll index c127d1208919f..71c3023ca153f 100644 --- a/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll +++ b/llvm/test/CodeGen/Mips/GlobalISel/llvm-ir/select.ll @@ -56,8 +56,9 @@ entry: define i32 @select_with_negation(i32 %a, i32 %b, i32 %x, i32 %y) { ; MIPS32-LABEL: select_with_negation: ; MIPS32: # %bb.0: # %entry -; MIPS32-NEXT: slt $1, $4, $5 -; MIPS32-NEXT: not $1, $1 +; MIPS32-NEXT: ori $1, $zero, 1 +; MIPS32-NEXT: slt $2, $4, $5 +; MIPS32-NEXT: xor $1, $2, $1 ; MIPS32-NEXT: andi $1, $1, 1 ; MIPS32-NEXT: movn $7, $6, $1 ; MIPS32-NEXT: move $2, $7 diff --git a/llvm/test/CodeGen/Mips/delay-slot-filler-bundled-insts.mir b/llvm/test/CodeGen/Mips/delay-slot-filler-bundled-insts.mir new file mode 100644 index 0000000000000..1539bb5f73e91 --- /dev/null +++ b/llvm/test/CodeGen/Mips/delay-slot-filler-bundled-insts.mir @@ -0,0 +1,153 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +## Check that the delay-slot filler does not attempt to split BUNDLE instructions +# RUN: llc %s -start-before=mips-delay-slot-filler -stop-after=mips-delay-slot-filler \ +# RUN: -verify-machineinstrs -o - | FileCheck %s +## Check that we can emit assembly for input with BUNDLE instructions: +# RUN: llc %s -start-before=mips-delay-slot-filler -verify-machineinstrs -o - | FileCheck %s -check-prefix ASM + +# ASM: # %bb.0: +# ASM-NEXT: daddiu $sp, $sp, -16 +# ASM-NEXT: sd $ra, 8($sp) +## BUNDLE should be emitted in order: +# ASM-NEXT: daddiu $sp, $sp, -16 +# ASM-NEXT: daddiu $sp, $sp, 16 +# ASM-NEXT: beqz $4, .LBB0_2 +# ASM-NEXT: nop +--- | + target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-n32:64-S128" + target triple = "mips64-unknown-freebsd" + declare i8* @func_a(i64 zeroext) + declare i8* @func_b(i64 zeroext) + ; Function Attrs: nounwind + define i8* @test(i64 zeroext %nbytes) local_unnamed_addr #0 { + entry: + %cmp = icmp eq i64 %nbytes, 0 + br i1 %cmp, label %if.else, label %if.then + + if.then: ; preds = %entry + %call = tail call i8* @func_a(i64 zeroext %nbytes) + br label %return + + if.else: ; preds = %entry + %call1 = tail call i8* @func_b(i64 zeroext 0) + br label %return + + return: ; preds = 
%if.else, %if.then + %retval.0 = phi i8* [ %call, %if.then ], [ %call1, %if.else ] + ret i8* %retval.0 + } + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #0 + + attributes #0 = { nounwind } + +... +--- +name: test +alignment: 8 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$a0_64', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 16 + offsetAdjustment: 0 + maxAlignment: 8 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '$ra_64', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; CHECK: $sp_64 = DADDiu $sp_64, -16 + ; CHECK: CFI_INSTRUCTION def_cfa_offset 16 + ; CHECK: SD killed $ra_64, $sp_64, 8 :: (store 8 into %stack.0) + ; CHECK: CFI_INSTRUCTION offset $ra_64, -8 + ; CHECK: BUNDLE { + ; CHECK: $sp_64 = DADDiu $sp_64, -16 + ; CHECK: $sp_64 = DADDiu $sp_64, 16 + ; CHECK: } + ; CHECK: BEQ64 renamable $a0_64, $zero_64, %bb.2, implicit-def $at { + ; CHECK: NOP + ; CHECK: } + ; CHECK: bb.1.if.then: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: JAL @func_a, csr_n64, implicit-def dead $ra, implicit $a0_64, implicit-def $sp, implicit-def $v0_64 { + ; CHECK: NOP + ; CHECK: } + ; CHECK: J %bb.3, implicit-def dead $at { + ; CHECK: NOP + ; CHECK: } + ; CHECK: bb.2.if.else: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: JAL @func_b, csr_n64, implicit-def dead $ra, implicit $a0_64, implicit-def $sp, implicit-def $v0_64 { + ; CHECK: $a0_64 = DADDiu $zero_64, 0 + ; CHECK: } + ; CHECK: bb.3.return: + ; CHECK: $ra_64 = LD $sp_64, 8 :: (load 8 from %stack.0) + ; CHECK: PseudoReturn64 undef $ra_64, implicit $v0_64 { + ; CHECK: $sp_64 = DADDiu $sp_64, 16 + ; CHECK: } + bb.0.entry: + successors: %bb.2(0x30000000), %bb.1(0x50000000) + liveins: $a0_64, $ra_64 + + $sp_64 = DADDiu $sp_64, -16 + CFI_INSTRUCTION def_cfa_offset 16 + SD killed $ra_64, $sp_64, 8 :: (store 8 into %stack.0) + CFI_INSTRUCTION offset $ra_64, -8 + ; This BUNDLE instruction must not be split by the delay slot filler: + BUNDLE { + $sp_64 = DADDiu $sp_64, -16 + $sp_64 = DADDiu $sp_64, 16 + } + BEQ64 renamable $a0_64, $zero_64, %bb.2, implicit-def $at + + bb.1.if.then: + successors: %bb.3(0x80000000) + liveins: $a0_64 + + JAL @func_a, csr_n64, implicit-def dead $ra, implicit $a0_64, implicit-def $sp, implicit-def $v0_64 + J %bb.3, implicit-def dead $at + + bb.2.if.else: + successors: %bb.3(0x80000000) + + $a0_64 = DADDiu $zero_64, 0 + JAL @func_b, csr_n64, implicit-def dead $ra, implicit $a0_64, implicit-def $sp, implicit-def $v0_64 + + bb.3.return: + liveins: $v0_64 + + $ra_64 = LD $sp_64, 8 :: (load 8 from %stack.0) + $sp_64 = DADDiu $sp_64, 16 + PseudoReturn64 undef $ra_64, implicit $v0_64 + +... 
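For context on what the test above guards against: the MIPS delay-slot filler hoists a single independent instruction into the slot behind a branch, and a BUNDLE is atomic, so when the only candidate sits inside a bundle the filler must fall back to a NOP rather than peel an instruction out of it. A hand-written sketch of the two outcomes (illustrative assembly, not taken from the test):

        # Slot filled: a lone, independent instruction is hoisted behind the branch.
        beqz   $4, .LBB0_2
        daddiu $sp, $sp, -16   # moved into the delay slot by the filler

        # Slot not fillable: the candidate lives inside a BUNDLE, which must stay
        # intact, so a NOP is emitted in the slot instead (as the ASM checks expect).
        beqz   $4, .LBB0_2
        nop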
diff --git a/llvm/test/CodeGen/PowerPC/addi-licm.ll b/llvm/test/CodeGen/PowerPC/addi-licm.ll index e0314d19bd3f1..24c9805f1343d 100644 --- a/llvm/test/CodeGen/PowerPC/addi-licm.ll +++ b/llvm/test/CodeGen/PowerPC/addi-licm.ll @@ -1,4 +1,4 @@ -; RUN: llc -verify-machineinstrs -mcpu=pwr7 -disable-ppc-preinc-prep < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -disable-ppc-instr-form-prep < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s -check-prefix=PIP target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/PowerPC/aix-func-dsc-gen.ll b/llvm/test/CodeGen/PowerPC/aix-func-dsc-gen.ll index 217f687e8b086..29b2d4c454c4e 100644 --- a/llvm/test/CodeGen/PowerPC/aix-func-dsc-gen.ll +++ b/llvm/test/CodeGen/PowerPC/aix-func-dsc-gen.ll @@ -103,7 +103,7 @@ entry: ; CHECK-NEXT: SectionLen: 0 ; CHECK-NEXT: ParameterHashIndex: 0x0 ; CHECK-NEXT: TypeChkSectNum: 0x0 -; CHECK-NEXT: SymbolAlignmentLog2: 0 +; CHECK-NEXT: SymbolAlignmentLog2: 2 ; CHECK-NEXT: SymbolType: XTY_SD (0x1) ; CHECK-NEXT: StorageMappingClass: XMC_TC0 (0xF) ; CHECK-NEXT: StabInfoIndex: 0x0 diff --git a/llvm/test/CodeGen/PowerPC/aix-lower-block-address.ll b/llvm/test/CodeGen/PowerPC/aix-lower-block-address.ll index 2d6353876a331..b4b9f029ed0bb 100644 --- a/llvm/test/CodeGen/PowerPC/aix-lower-block-address.ll +++ b/llvm/test/CodeGen/PowerPC/aix-lower-block-address.ll @@ -69,4 +69,4 @@ __here: ; 64LARGE-ASM: ld [[REG2:[0-9]+]], LC0@l([[REG1]]) ; CHECK: .toc -; CHECK-NOT: .tc +; CHECK: .tc Ltmp0[TC],Ltmp0 diff --git a/llvm/test/CodeGen/PowerPC/aix-lower-constant-pool-index.ll b/llvm/test/CodeGen/PowerPC/aix-lower-constant-pool-index.ll index 8803a1e4569fb..1db8a55fb28e5 100644 --- a/llvm/test/CodeGen/PowerPC/aix-lower-constant-pool-index.ll +++ b/llvm/test/CodeGen/PowerPC/aix-lower-constant-pool-index.ll @@ -84,4 +84,4 @@ entry: ; 64LARGE-ASM: blr ; CHECK: .toc -; CHECK-NOT: .tc +; CHECK: .tc .LCPI0_0[TC],.LCPI0_0 diff --git a/llvm/test/CodeGen/PowerPC/aix-lower-jump-table.ll b/llvm/test/CodeGen/PowerPC/aix-lower-jump-table.ll index 5efb956b1529d..a5ec1942a3157 100644 --- a/llvm/test/CodeGen/PowerPC/aix-lower-jump-table.ll +++ b/llvm/test/CodeGen/PowerPC/aix-lower-jump-table.ll @@ -185,4 +185,4 @@ ; 64LARGE-ASM: .long LBB0_5-.LJTI0_0 ; CHECK: .toc -; CHECK-NOT: .tc +; CHECK: .tc .LJTI0_0[TC],.LJTI0_0 diff --git a/llvm/test/CodeGen/PowerPC/aix-weak-undef-func-call.ll b/llvm/test/CodeGen/PowerPC/aix-weak-undef-func-call.ll new file mode 100644 index 0000000000000..9fb3dec19edf2 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-weak-undef-func-call.ll @@ -0,0 +1,30 @@ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -filetype=obj -o %t.o < %s +; RUN: llvm-readobj --symbols %t.o | FileCheck %s + +define void @bar() { +entry: + call void bitcast (void (...)* @foo to void ()*)() + ret void +} + +declare extern_weak void @foo(...) 
+ +;CHECK: Symbol { +;CHECK: Name: .foo +;CHECK-NEXT: Value (RelocatableAddress): 0x0 +;CHECK-NEXT: Section: N_UNDEF +;CHECK-NEXT: Type: 0x0 +;CHECK-NEXT: StorageClass: C_WEAKEXT (0x6F) +;CHECK-NEXT: NumberOfAuxEntries: 1 +;CHECK-NEXT: CSECT Auxiliary Entry { +;CHECK: SectionLen: 0 +;CHECK-NEXT: ParameterHashIndex: 0x0 +;CHECK-NEXT: TypeChkSectNum: 0x0 +;CHECK-NEXT: SymbolAlignmentLog2: 0 +;CHECK-NEXT: SymbolType: XTY_ER (0x0) +;CHECK-NEXT: StorageMappingClass: XMC_PR (0x0) +;CHECK-NEXT: StabInfoIndex: 0x0 +;CHECK-NEXT: StabSectNum: 0x0 +;CHECK-NEXT: } +;CHECK-NEXT: } + diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-data-only-notoc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-data-only-notoc.ll new file mode 100644 index 0000000000000..d6e772ffc928e --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-data-only-notoc.ll @@ -0,0 +1,19 @@ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck %s +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s + +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -filetype=obj -o %t.o < %s +; RUN: llvm-readobj --syms %t.o | FileCheck --check-prefix=SYMS %s + +; RUN: not llc -mtriple powerpc64-ibm-aix-xcoff -filetype=obj < %s 2>&1 | \ +; RUN: FileCheck --check-prefix=OBJ64 %s +; OBJ64: LLVM ERROR: 64-bit XCOFF object files are not supported yet. + +@a = external global i32, align 4 +@b = external global i64, align 8 +@c = external global i16, align 2 +@globa = common global i32 0, align 4 + +@ptr = internal global void (...)* null, align 4 + +; CHECK-NOT: .toc +; SYMS-NOT: Name: TOC diff --git a/llvm/test/CodeGen/PowerPC/aix-xcoff-toc.ll b/llvm/test/CodeGen/PowerPC/aix-xcoff-toc.ll new file mode 100644 index 0000000000000..57f97064b5c9a --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix-xcoff-toc.ll @@ -0,0 +1,214 @@ +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck --check-prefixes CHECK,CHECK32 %s +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck --check-prefixes CHECK,CHECK64 %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc-ibm-aix-xcoff -filetype=obj -o %t.o < %s +; RUN: llvm-readobj --syms %t.o | FileCheck --check-prefix=SYM %s + +; RUN: not llc -verify-machineinstrs -mcpu=pwr7 -mtriple powerpc64-ibm-aix-xcoff -filetype=obj -o %t.o 2>&1 \ +; RUN: < %s | FileCheck --check-prefix=XCOFF64 %s +; XCOFF64: LLVM ERROR: 64-bit XCOFF object files are not supported yet. 
+ +@a = external global i32, align 4 +@b = external global i64, align 8 +@c = external global i16, align 2 +@globa = common global i32 0, align 4 + +@ptr = internal global void (...)* null, align 4 + +declare void @foo() + +define void @bar() { + %1 = alloca i8*, align 8 + store i32 0, i32* @a, align 4 + store i64 0, i64* @b, align 8 + store i16 0, i16* @c, align 2 + store i32 0, i32* @globa, align 4 + store void (...)* bitcast (void ()* @bar to void (...)*), void (...)** @ptr, align 4 + store i8* bitcast (void ()* @foo to i8*), i8** %1, align 8 + ret void +} + +; CHECK-NOT: .comm a +; CHECK-NOT: .lcomm a +; CHECK-NOT: .comm b +; CHECK-NOT: .lcomm b +; CHECK-NOT: .comm c +; CHECK-NOT: .lcomm c +; CHECK: .comm globa[RW],4,2 +; CHECK32: .lcomm ptr,4,ptr[BS],2 +; CHECK64: .lcomm ptr,8,ptr[BS],2 +; CHECK: .toc +; CHECK-NEXT: LC0: +; CHECK-NEXT: .tc a[TC],a[UA] +; CHECK-NEXT: LC1: +; CHECK-NEXT: .tc b[TC],b[UA] +; CHECK-NEXT: LC2: +; CHECK-NEXT: .tc c[TC],c[UA] +; CHECK-NEXT: LC3: +; CHECK-NEXT: .tc globa[TC],globa[RW] +; CHECK-NEXT: LC4: +; CHECK-NEXT: .tc ptr[TC],ptr[BS] +; CHECK-NEXT: LC5: +; CHECK-NEXT: .tc bar[TC],bar[DS] +; CHECK-NEXT: LC6: +; CHECK-NEXT: .tc foo[TC],foo[DS] + +; SYM: File: {{.*}}aix-xcoff-toc.ll.tmp.o +; SYM: Symbol {{[{][[:space:]] *}}Index: [[#INDX:]]{{[[:space:]] *}}Name: TOC +; SYM-NEXT: Value (RelocatableAddress): 0x54 +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+1]] +; SYM-NEXT: SectionLen: 0 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC0 (0xF) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+2]] +; SYM-NEXT: Name: a +; SYM-NEXT: Value (RelocatableAddress): 0x54 +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+3]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+4]] +; SYM-NEXT: Name: b +; SYM-NEXT: Value (RelocatableAddress): 0x58 +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+5]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+6]] +; SYM-NEXT: Name: c +; SYM-NEXT: Value (RelocatableAddress): 0x5C +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+7]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; 
SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+8]] +; SYM-NEXT: Name: globa +; SYM-NEXT: Value (RelocatableAddress): 0x60 +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+9]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+10]] +; SYM-NEXT: Name: ptr +; SYM-NEXT: Value (RelocatableAddress): 0x64 +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+11]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+12]] +; SYM-NEXT: Name: bar +; SYM-NEXT: Value (RelocatableAddress): 0x68 +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+13]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } +; SYM-NEXT: Symbol { +; SYM-NEXT: Index: [[#INDX+14]] +; SYM-NEXT: Name: foo +; SYM-NEXT: Value (RelocatableAddress): 0x6C +; SYM-NEXT: Section: .data +; SYM-NEXT: Type: 0x0 +; SYM-NEXT: StorageClass: C_HIDEXT (0x6B) +; SYM-NEXT: NumberOfAuxEntries: 1 +; SYM-NEXT: CSECT Auxiliary Entry { +; SYM-NEXT: Index: [[#INDX+15]] +; SYM-NEXT: SectionLen: 4 +; SYM-NEXT: ParameterHashIndex: 0x0 +; SYM-NEXT: TypeChkSectNum: 0x0 +; SYM-NEXT: SymbolAlignmentLog2: 2 +; SYM-NEXT: SymbolType: XTY_SD (0x1) +; SYM-NEXT: StorageMappingClass: XMC_TC (0x3) +; SYM-NEXT: StabInfoIndex: 0x0 +; SYM-NEXT: StabSectNum: 0x0 +; SYM-NEXT: } +; SYM-NEXT: } diff --git a/llvm/test/CodeGen/PowerPC/aix_cc_abi.ll b/llvm/test/CodeGen/PowerPC/aix_cc_abi.ll new file mode 100644 index 0000000000000..b15b63b166f70 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/aix_cc_abi.ll @@ -0,0 +1,614 @@ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefixes=CHECK,32BIT %s + +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ +; RUN: FileCheck --check-prefixes=CHECK,64BIT %s + +define void @call_test_chars() { +entry: + call i8 @test_chars(i8 signext 97, i8 signext 97, i8 signext 97, i8 signext 97) + ret void +} + +; CHECK-LABEL: name: call_test_chars + +; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT: $r3 = LI 97 +; 32BIT: 
$r4 = LI 97 +; 32BIT: $r5 = LI 97 +; 32BIT: $r6 = LI 97 +; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1 +; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: $x3 = LI8 97 +; 64BIT: $x4 = LI8 97 +; 64BIT: $x5 = LI8 97 +; 64BIT: $x6 = LI8 97 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define signext i8 @test_chars(i8 signext %c1, i8 signext %c2, i8 signext %c3, i8 signext %c4) { +entry: + %conv = sext i8 %c1 to i32 + %conv1 = sext i8 %c2 to i32 + %add = add nsw i32 %conv, %conv1 + %conv2 = sext i8 %c3 to i32 + %add3 = add nsw i32 %add, %conv2 + %conv4 = sext i8 %c4 to i32 + %add5 = add nsw i32 %add3, %conv4 + %conv6 = trunc i32 %add5 to i8 + ret i8 %conv6 +} + +; CHECK-LABEL: name: test_chars + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r5', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r6', virtual-reg: '' } +; 32BIT: body: +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x5', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x6', virtual-reg: '' } +; 64BIT: body: +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 + +define void @call_test_chars_mix() { +entry: + call i8 @test_chars_mix(i8 signext 97, i8 zeroext -31, i8 zeroext 97, i8 signext -31) + ret void +} + +; CHECK-LABEL: name: call_test_chars_mix + +; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT: $r3 = LI 97 +; 32BIT: $r4 = LI 225 +; 32BIT: $r5 = LI 97 +; 32BIT: $r6 = LI -31 +; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1 +; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: $x3 = LI8 97 +; 64BIT: $x4 = LI8 225 +; 64BIT: $x5 = LI8 97 +; 64BIT: $x6 = LI8 -31 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define signext i8 @test_chars_mix(i8 signext %c1, i8 zeroext %c2, i8 zeroext %c3, i8 signext %c4) { +entry: + %conv = sext i8 %c1 to i32 + %conv1 = zext i8 %c2 to i32 + %add = add nsw i32 %conv, %conv1 + %conv2 = zext i8 %c3 to i32 + %add3 = add nsw i32 %add, %conv2 + %conv4 = sext i8 %c4 to i32 + %add5 = add nsw i32 %add3, %conv4 + %conv6 = trunc i32 %add5 to i8 + ret i8 %conv6 +} + +; CHECK-LABEL: name: test_chars_mix + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r5', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r6', virtual-reg: '' } +; 32BIT: body: +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: 
'$x3', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x5', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x6', virtual-reg: '' } +; 64BIT: body: +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 + +@global_i1 = global i8 0, align 1 + +define void @test_i1(i1 %b) { + entry: + %frombool = zext i1 %b to i8 + store i8 %frombool, i8* @global_i1, align 1 + ret void +} + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3 +; 32BIT: renamable $r3 = RLWINM killed renamable $r3, 0, 31, 31 +; 32BIT-NEXT: STB killed renamable $r3, 0, killed renamable $r4 :: (store 1 into @global_i1) + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3 +; 64BIT: renamable $r[[REG1:[0-9]+]] = RLWINM renamable $r[[REG1]], 0, 31, 31, implicit killed $x3 +; 64BIT-NEXT: STB killed renamable $r[[REG1]], 0, killed renamable $x4 :: (store 1 into @global_i1) + +define void @call_test_i1() { +entry: + call void @test_i1(i1 1) + ret void +} + +; CHECK-LABEL: name: call_test_i1 + +; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT: $r3 = LI 1 +; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit $r2, implicit-def $r1 +; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: $x3 = LI8 1 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define void @test_i1zext(i1 zeroext %b) { + entry: + %frombool = zext i1 %b to i8 + store i8 %frombool, i8 * @global_i1, align 1 + ret void + } + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3 +; CHECK-NOT: RLWINM +; 32BIT: STB killed renamable $r3, 0, killed renamable $r4 :: (store 1 into @global_i1) + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3 +; CHECK-NOT: RLWINM +; 64BIT: STB8 killed renamable $x3, 0, killed renamable $x4 :: (store 1 into @global_i1) + +define i32 @test_ints(i32 signext %a, i32 zeroext %b, i32 zeroext %c, i32 signext %d, i32 signext %e, i32 signext %f, i32 signext %g, i32 signext %h) { +entry: + %add = add i32 %a, %b + %add1 = add i32 %add, %c + %add2 = add i32 %add1, %d + %add3 = add i32 %add2, %e + %add4 = add i32 %add3, %f + %add5 = add i32 %add4, %g + %add6 = add i32 %add5, %h + ret i32 %add6 +} + +; CHECK-LABEL: name: test_ints + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r5', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r6', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r7', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r8', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r9', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r10', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x5', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x6', virtual-reg: 
'' } +; 64BIT-NEXT: - { reg: '$x7', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x8', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x9', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x10', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10 + +define void @call_test_ints() { +entry: + call i32 @test_ints(i32 signext 1, i32 zeroext 1, i32 zeroext 2147483648, i32 signext -2147483648, i32 signext 1, i32 signext 1, i32 signext 1, i32 signext 1) + ret void +} + +; CHECK-LABEL: name: call_test_ints + +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: renamable $x3 = LI8 1 +; 64BIT: renamable $x5 = RLDICR killed renamable $x3, 31, 32 +; 64BIT: $x3 = LI8 1 +; 64BIT: $x4 = LI8 1 +; 64BIT: $x6 = LIS8 32768 +; 64BIT: $x7 = LI8 1 +; 64BIT: $x8 = LI8 1 +; 64BIT: $x9 = LI8 1 +; 64BIT: $x10 = LI8 1 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $x2, implicit-def $r1, implicit-def dead $x3 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define void @call_test_i64() { +entry: + call i64 @test_i64(i64 1, i64 2, i64 3, i64 4) + ret void +} + + +; CHECK-LABEL: name: call_test_i64 + +; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT: $r3 = LI 0 +; 32BIT: $r4 = LI 1 +; 32BIT: $r5 = LI 0 +; 32BIT: $r6 = LI 2 +; 32BIT: $r7 = LI 0 +; 32BIT: $r8 = LI 3 +; 32BIT: $r9 = LI 0 +; 32BIT: $r10 = LI 4 +; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $r2, implicit-def $r1 +; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: $x3 = LI8 1 +; 64BIT: $x4 = LI8 2 +; 64BIT: $x5 = LI8 3 +; 64BIT: $x6 = LI8 4 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define i64 @test_i64(i64 %a, i64 %b, i64 %c, i64 %d) { +entry: + %add = add nsw i64 %a, %b + %add1 = add nsw i64 %add, %c + %add2 = add nsw i64 %add1, %d + ret i64 %add2 +} + +; CHECK-LABEL: name: test_i64 + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r5', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r6', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r7', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r8', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r9', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r10', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x5', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x6', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6 + +define void @call_test_int_ptr() { +entry: + %b = alloca i32, align 4 + store i32 0, i32* %b, align 4 + call void @test_int_ptr(i32* %b) 
+ ret void +} + +; CHECK-LABEL: name: call_test_int_ptr + +; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT: renamable $r3 = ADDI %stack.0.b, 0 +; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1 +; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: renamable $x3 = ADDI8 %stack.0.b, 0 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define void @test_int_ptr(i32* %a) { +entry: + %a.addr = alloca i32*, align 8 + store i32* %a, i32** %a.addr, align 8 + ret void +} + +; CHECK-LABEL: name: test_int_ptr + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3 +; 32BIT: STW killed renamable $r3, 0, %stack.0.a.addr :: (store 4 into %ir.a.addr, align 8) + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3 +; 64BIT: STD killed renamable $x3, 0, %stack.0.a.addr :: (store 8 into %ir.a.addr) + + +define i32 @caller(i32 %i) { +entry: + %i.addr = alloca i32, align 4 + %b = alloca i8, align 1 + store i32 %i, i32* %i.addr, align 4 + %0 = load i32, i32* %i.addr, align 4 + %cmp = icmp ne i32 %0, 0 + %frombool = zext i1 %cmp to i8 + store i8 %frombool, i8* %b, align 1 + %1 = load i8, i8* %b, align 1 + %tobool = trunc i8 %1 to i1 + %call = call i32 @call_test_bool(i1 zeroext %tobool) + ret i32 %call +} + +declare i32 @call_test_bool(i1 zeroext) + +; CHECK-LABEL: name: caller + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT: liveins: $r3 +; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1, implicit-def $r3 +; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3 +; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1, implicit-def $x3 +; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +@f1 = global float 0.000000e+00, align 4 +@d1 = global double 0.000000e+00, align 8 + +define void @call_test_floats() { +entry: + %0 = load float, float* @f1, align 4 + call float @test_floats(float %0, float %0, float %0) + ret void +} + +; CHECK-LABEL: name: call_test_floats{{.*}} + +; 32BIT: renamable $r3 = LWZtoc @f1, $r2 :: (load 4 from got) +; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load 4 from @f1) +; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT-NEXT: $f2 = COPY renamable $f1 +; 32BIT-NEXT: $f3 = COPY renamable $f1 +; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $r2, implicit-def $r1 +; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: renamable $x3 = LDtoc @f1, $x2 :: (load 8 from got) +; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x3 
:: (dereferenceable load 4 from @f1) +; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT-NEXT: $f2 = COPY renamable $f1 +; 64BIT-NEXT: $f3 = COPY renamable $f1 +; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $x2, implicit-def $r1 +; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define float @test_floats(float %f1, float %f2, float %f3) { +entry: + %add = fadd float %f1, %f2 + %add1 = fadd float %add, %f3 + ret float %add1 +} + +; CHECK-LABEL: name: test_floats{{.*}} + +; CHECK: liveins: +; CHECK-NEXT: - { reg: '$f1', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f2', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f3', virtual-reg: '' } +; CHECK: body: | +; CHECK-NEXT: bb.0.entry: +; CHECK-NEXT: liveins: $f1, $f2, $f3 + +define void @call_test_fpr_max() { +entry: + %0 = load double, double* @d1, align 8 + call double @test_fpr_max(double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0) + ret void +} + +; CHECK-LABEL: name: call_test_fpr_max{{.*}} + +; 32BIT: renamable $r3 = LWZtoc @d1, $r2 :: (load 4 from got) +; 32BIT-NEXT: renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load 8 from @d1) +; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT-NEXT: $f2 = COPY renamable $f1 +; 32BIT-NEXT: $f3 = COPY renamable $f1 +; 32BIT-NEXT: $f4 = COPY renamable $f1 +; 32BIT-NEXT: $f5 = COPY renamable $f1 +; 32BIT-NEXT: $f6 = COPY renamable $f1 +; 32BIT-NEXT: $f7 = COPY renamable $f1 +; 32BIT-NEXT: $f8 = COPY renamable $f1 +; 32BIT-NEXT: $f9 = COPY renamable $f1 +; 32BIT-NEXT: $f10 = COPY renamable $f1 +; 32BIT-NEXT: $f11 = COPY renamable $f1 +; 32BIT-NEXT: $f12 = COPY renamable $f1 +; 32BIT-NEXT: $f13 = COPY renamable $f1 +; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1 +; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: renamable $x3 = LDtoc @d1, $x2 :: (load 8 from got) +; 64BIT-NEXT: renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load 8 from @d1) +; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT-NEXT: $f2 = COPY renamable $f1 +; 64BIT-NEXT: $f3 = COPY renamable $f1 +; 64BIT-NEXT: $f4 = COPY renamable $f1 +; 64BIT-NEXT: $f5 = COPY renamable $f1 +; 64BIT-NEXT: $f6 = COPY renamable $f1 +; 64BIT-NEXT: $f7 = COPY renamable $f1 +; 64BIT-NEXT: $f8 = COPY renamable $f1 +; 64BIT-NEXT: $f9 = COPY renamable $f1 +; 64BIT-NEXT: $f10 = COPY renamable $f1 +; 64BIT-NEXT: $f11 = COPY renamable $f1 +; 64BIT-NEXT: $f12 = COPY renamable $f1 +; 64BIT-NEXT: $f13 = COPY renamable $f1 +; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $x2, implicit-def $r1 +; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define double 
@test_fpr_max(double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, double %d8, double %d9, double %d10, double %d11, double %d12, double %d13) { +entry: + %add = fadd double %d1, %d2 + %add1 = fadd double %add, %d3 + %add2 = fadd double %add1, %d4 + %add3 = fadd double %add2, %d5 + %add4 = fadd double %add3, %d6 + %add5 = fadd double %add4, %d7 + %add6 = fadd double %add5, %d8 + %add7 = fadd double %add6, %d9 + %add8 = fadd double %add7, %d10 + %add9 = fadd double %add8, %d11 + %add10 = fadd double %add9, %d12 + %add11 = fadd double %add10, %d13 + ret double %add11 +} + +; CHECK-LABEL: name: test_fpr_max{{.*}} + +; CHECK: liveins: +; CHECK-NEXT: - { reg: '$f1', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f2', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f3', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f4', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f5', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f6', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f7', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f8', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f9', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f10', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f11', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f12', virtual-reg: '' } +; CHECK-NEXT: - { reg: '$f13', virtual-reg: '' } +; CHECK: body: | +; CHECK-NEXT: bb.0.entry: +; CHECK-NEXT: liveins: $f1, $f2, $f3, $f4, $f5, $f6, $f7, $f8, $f9, $f10, $f11, $f12, $f13 + +define void @call_test_mix() { +entry: + %0 = load float, float* @f1, align 4 + %1 = load double, double* @d1, align 8 + call i32 @test_mix(float %0, i32 1, double %1, i8 signext 97) + ret void +} + +; CHECK-LABEL: name: call_test_mix{{.*}} + +; 32BIT: renamable $r[[REG1:[0-9]+]] = LWZtoc @f1, $r2 :: (load 4 from got) +; 32BIT-NEXT: renamable $r[[REG2:[0-9]+]] = LWZtoc @d1, $r2 :: (load 4 from got) +; 32BIT-NEXT: renamable $f1 = LFS 0, killed renamable $r[[REG1]] :: (dereferenceable load 4 from @f1) +; 32BIT-NEXT: renamable $f2 = LFD 0, killed renamable $r[[REG2]] :: (dereferenceable load 8 from @d1) +; 32BIT-NEXT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 +; 32BIT-NEXT: $r4 = LI 1 +; 32BIT-NEXT: $r7 = LI 97 +; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $r4, implicit $f2, implicit killed $r7, implicit $r2, implicit-def $r1 +; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 + +; 64BIT: renamable $x[[REG1:[0-9]+]] = LDtoc @f1, $x2 :: (load 8 from got) +; 64BIT-NEXT: renamable $x[[REG2:[0-9]+]] = LDtoc @d1, $x2 :: (load 8 from got) +; 64BIT-NEXT: renamable $f1 = LFS 0, killed renamable $x[[REG1]] :: (dereferenceable load 4 from @f1) +; 64BIT-NEXT: renamable $f2 = LFD 0, killed renamable $x[[REG2]] :: (dereferenceable load 8 from @d1) +; 64BIT-NEXT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; 64BIT-NEXT: $x4 = LI8 1 +; 64BIT-NEXT: $x6 = LI8 97 +; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $x4, implicit $f2, implicit killed $x6, implicit $x2, implicit-def $r1 +; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 + +define i32 @test_mix(float %f, i32 signext %i, double %d, i8 signext %c) { +entry: + %conv = fpext float %f to double + %add = fadd double %conv, %d + %conv1 = fptrunc double %add to float + %conv2 = zext i8 %c to i32 + %add3 = add nsw i32 %i, %conv2 + %conv4 = sitofp i32 %add3 to float + %add5 = fadd float %conv4, %conv1 + %conv6 = fptosi float %add5 to i32 + ret i32 %conv6 +} + +; 
CHECK-LABEL: name: test_mix{{.*}} + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$f1', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$f2', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r7', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $f1, $f2, $r4, $r7 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$f1', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$f2', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x6', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $f1, $f2, $x4, $x6 + + +define i64 @callee_mixed_ints(i32 %a, i8 signext %b, i32 %c, i16 signext %d, i64 %e) { +entry: + %conv = zext i8 %b to i32 + %add = add nsw i32 %a, %conv + %add1 = add nsw i32 %add, %c + %conv2 = sext i16 %d to i32 + %add3 = add nsw i32 %add1, %conv2 + %conv4 = sext i32 %add3 to i64 + %add5 = add nsw i64 %conv4, %e + ret i64 %add5 + } + +; CHECK-LABEL: name: callee_mixed_ints + +; 32BIT: liveins: +; 32BIT-NEXT: - { reg: '$r3', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r4', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r5', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r6', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r7', virtual-reg: '' } +; 32BIT-NEXT: - { reg: '$r8', virtual-reg: '' } +; 32BIT: body: | +; 32BIT-NEXT: bb.0.entry: +; 32BIT-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8 + +; 64BIT: liveins: +; 64BIT-NEXT: - { reg: '$x3', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x4', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x5', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x6', virtual-reg: '' } +; 64BIT-NEXT: - { reg: '$x7', virtual-reg: '' } +; 64BIT: body: | +; 64BIT-NEXT: bb.0.entry: +; 64BIT-NEXT: liveins: $x3, $x4, $x5, $x6, $x7 diff --git a/llvm/test/CodeGen/PowerPC/aix_fpr_param.ll b/llvm/test/CodeGen/PowerPC/aix_fpr_param.ll deleted file mode 100644 index f92096f3ab7d3..0000000000000 --- a/llvm/test/CodeGen/PowerPC/aix_fpr_param.ll +++ /dev/null @@ -1,150 +0,0 @@ -; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp < %s | \ -; RUN: FileCheck --check-prefix=32BIT %s - -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp < %s | \ -; RUN: FileCheck --check-prefix=64BIT %s - -@f1 = global float 0.000000e+00, align 4 -@d1 = global double 0.000000e+00, align 8 - -define void @call_test_float() { -entry: -; 32BIT: renamable $r3 = LWZtoc @f1, $r2 :: (load 4 from got) -; 32BIT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load 4 from @f1) -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: renamable $x3 = LDtoc @f1, $x2 :: (load 8 from got) -; 64BIT: renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load 4 from @f1) -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - %0 = load float, float* @f1, align 4 - call void @test_float(float %0) - ret void -} - -declare void @test_float(float) - -define void @call_test_floats() { -entry: -; 32BIT: renamable $r3 = LWZtoc @f1, $r2 :: (load 4 from got) -; 32BIT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load 4 from @f1) -; 
32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $f2 = COPY renamable $f1 -; 32BIT: $f3 = COPY renamable $f1 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: renamable $x3 = LDtoc @f1, $x2 :: (load 8 from got) -; 64BIT: renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load 4 from @f1) -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $f2 = COPY renamable $f1 -; 64BIT: $f3 = COPY renamable $f1 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - %0 = load float, float* @f1, align 4 - call void @test_floats(float %0, float %0, float %0) - ret void -} - -declare void @test_floats(float, float, float) - -define void @call_test_double() { -entry: -; 32BIT: renamable $r3 = LWZtoc @d1, $r2 :: (load 4 from got) -; 32BIT: renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load 8 from @d1) -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: renamable $x3 = LDtoc @d1, $x2 :: (load 8 from got) -; 64BIT: renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load 8 from @d1) -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - %0 = load double, double* @d1, align 8 - call void @test_double(double %0) - ret void -} - -declare void @test_double(double) - -define void @call_test_fpr_max() { -entry: -; 32BIT: renamable $r3 = LWZtoc @d1, $r2 :: (load 4 from got) -; 32BIT: renamable $f1 = LFD 0, killed renamable $r3 :: (dereferenceable load 8 from @d1) -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $f2 = COPY renamable $f1 -; 32BIT: $f3 = COPY renamable $f1 -; 32BIT: $f4 = COPY renamable $f1 -; 32BIT: $f5 = COPY renamable $f1 -; 32BIT: $f6 = COPY renamable $f1 -; 32BIT: $f7 = COPY renamable $f1 -; 32BIT: $f8 = COPY renamable $f1 -; 32BIT: $f9 = COPY renamable $f1 -; 32BIT: $f10 = COPY renamable $f1 -; 32BIT: $f11 = COPY renamable $f1 -; 32BIT: $f12 = COPY renamable $f1 -; 32BIT: $f13 = COPY renamable $f1 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: renamable $x3 = LDtoc @d1, $x2 :: (load 8 from got) -; 64BIT: renamable $f1 = LFD 0, killed renamable $x3 :: (dereferenceable load 8 from @d1) -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $f2 = COPY renamable $f1 -; 64BIT: $f3 = COPY renamable $f1 -; 64BIT: $f4 = COPY renamable $f1 -; 64BIT: $f5 = COPY renamable $f1 -; 64BIT: $f6 = COPY renamable 
$f1 -; 64BIT: $f7 = COPY renamable $f1 -; 64BIT: $f8 = COPY renamable $f1 -; 64BIT: $f9 = COPY renamable $f1 -; 64BIT: $f10 = COPY renamable $f1 -; 64BIT: $f11 = COPY renamable $f1 -; 64BIT: $f12 = COPY renamable $f1 -; 64BIT: $f13 = COPY renamable $f1 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit killed $f2, implicit killed $f3, implicit killed $f4, implicit killed $f5, implicit killed $f6, implicit killed $f7, implicit killed $f8, implicit killed $f9, implicit killed $f10, implicit killed $f11, implicit killed $f12, implicit killed $f13, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - %0 = load double, double* @d1, align 8 - call void @test_fpr_max(double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0, double %0) - ret void -} - -declare void @test_fpr_max(double, double, double, double, double, double, double, double, double, double, double, double, double) - -define void @call_test_mix() { -entry: -; 32BIT: renamable $r3 = LWZtoc @f1, $r2 :: (load 4 from got) -; 32BIT: renamable $r4 = LWZtoc @d1, $r2 :: (load 4 from got) -; 32BIT: renamable $f1 = LFS 0, killed renamable $r3 :: (dereferenceable load 4 from @f1) -; 32BIT: renamable $f2 = LFD 0, killed renamable $r4 :: (dereferenceable load 8 from @d1) -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r4 = LI 1 -; 32BIT: $r7 = LI 97 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $f1, implicit $r4, implicit $f2, implicit killed $r7, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: renamable $x3 = LDtoc @f1, $x2 :: (load 8 from got) -; 64BIT: renamable $x4 = LDtoc @d1, $x2 :: (load 8 from got) -; 64BIT: renamable $f1 = LFS 0, killed renamable $x3 :: (dereferenceable load 4 from @f1) -; 64BIT: renamable $f2 = LFD 0, killed renamable $x4 :: (dereferenceable load 8 from @d1) -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x4 = LI8 1 -; 64BIT: $x6 = LI8 97 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $f1, implicit $x4, implicit $f2, implicit killed $x6, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - %0 = load float, float* @f1, align 4 - %1 = load double, double* @d1, align 8 - call void @test_mix(float %0, i32 1, double %1, i8 signext 97) - ret void -} - -declare void @test_mix(float, i32, double, i8 signext) diff --git a/llvm/test/CodeGen/PowerPC/aix_gpr_param.ll b/llvm/test/CodeGen/PowerPC/aix_gpr_param.ll deleted file mode 100644 index 42b6f886e687d..0000000000000 --- a/llvm/test/CodeGen/PowerPC/aix_gpr_param.ll +++ /dev/null @@ -1,199 +0,0 @@ -; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp < %s | \ -; RUN: FileCheck --check-prefix=32BIT %s - -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp < %s | \ -; RUN: FileCheck --check-prefix=64BIT %s - -define void @call_test_char() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 97 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 97 -; 64BIT: BL8_NOP , csr_aix64, 
implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_char(i8 signext 97) - ret void -} - -define void @call_test_chars() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 97 -; 32BIT: $r4 = LI 97 -; 32BIT: $r5 = LI 97 -; 32BIT: $r6 = LI 97 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 97 -; 64BIT: $x4 = LI8 97 -; 64BIT: $x5 = LI8 97 -; 64BIT: $x6 = LI8 97 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_chars(i8 signext 97, i8 signext 97, i8 signext 97, i8 signext 97) - ret void -} - -define void @call_test_chars_mix() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 97 -; 32BIT: $r4 = LI 225 -; 32BIT: $r5 = LI 97 -; 32BIT: $r6 = LI -31 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 97 -; 64BIT: $x4 = LI8 225 -; 64BIT: $x5 = LI8 97 -; 64BIT: $x6 = LI8 -31 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_chars_mix(i8 signext 97, i8 zeroext -31, i8 zeroext 97, i8 signext -31) - ret void -} - -define void @call_test_int() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 1 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 1 -; 64BIT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_int(i32 1) - ret void -} - -define void @call_test_ints() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 1 -; 32BIT: $r4 = LI 1 -; 32BIT: $r5 = LI 1 -; 32BIT: $r6 = LI 1 -; 32BIT: $r7 = LI 1 -; 32BIT: $r8 = LI 1 -; 32BIT: $r9 = LI 1 -; 32BIT: $r10 = LI 1 -; 32BIT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit killed $r5, implicit killed $r6, implicit killed $r7, implicit killed $r8, implicit killed $r9, implicit killed $r10, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 1 -; 64BIT: $x4 = LI8 1 -; 
64BIT: $x5 = LI8 1 -; 64BIT: $x6 = LI8 1 -; 64BIT: $x7 = LI8 1 -; 64BIT: $x8 = LI8 1 -; 64BIT: $x9 = LI8 1 -; 64BIT: $x10 = LI8 1 -; 64BIT: BL8_NOP <mcsymbol .test_ints>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit killed $x4, implicit killed $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_ints(i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1) - ret void -} - -define void @call_test_ints_64bit() { -entry: -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: renamable $x3 = LI8 1 -; 64BIT: renamable $x5 = RLDICR killed renamable $x3, 31, 32 -; 64BIT: $x3 = LI8 1 -; 64BIT: $x4 = LI8 1 -; 64BIT: $x6 = LIS8 32768 -; 64BIT: $x7 = LI8 1 -; 64BIT: $x8 = LI8 1 -; 64BIT: $x9 = LI8 1 -; 64BIT: $x10 = LI8 1 -; 64BIT: BL8_NOP <mcsymbol .test_ints_64bit>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit killed $x4, implicit $x5, implicit killed $x6, implicit killed $x7, implicit killed $x8, implicit killed $x9, implicit killed $x10, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_ints_64bit(i32 signext 1, i32 zeroext 1, i32 zeroext 2147483648, i32 signext -2147483648, i32 signext 1, i32 signext 1, i32 signext 1, i32 signext 1) - ret void -} - -define void @call_test_i1() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 1 -; 32BIT: BL_NOP <mcsymbol .test_i1>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 1 -; 64BIT: BL8_NOP <mcsymbol .test_i1>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_i1(i1 1) - ret void -} - -define void @call_test_i64() { -entry: -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: $r3 = LI 0 -; 32BIT: $r4 = LI 1 -; 32BIT: BL_NOP <mcsymbol .test_i64>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit killed $r3, implicit killed $r4, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: $x3 = LI8 1 -; 64BIT: BL8_NOP <mcsymbol .test_i64>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit killed $x3, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - call void @test_i64(i64 1) - ret void -} - -define void @call_test_int_ptr() { -entry: - %b = alloca i32, align 4 -; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 -; 32BIT: renamable $r3 = ADDI %stack.0.b, 0 -; 32BIT: BL_NOP <mcsymbol .test_int_ptr>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1 -; 32BIT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 - -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT: renamable $x3 = ADDI8 %stack.0.b, 0 -; 64BIT: BL8_NOP <mcsymbol .test_int_ptr>, csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 - - store i32 0, i32* %b, align 4 - call void @test_int_ptr(i32* %b) - ret void -} - -declare void 
@test_char(i8 signext) - -declare void @test_chars(i8 signext, i8 signext, i8 signext, i8 signext) - -declare void @test_chars_mix(i8 signext, i8 zeroext, i8 zeroext, i8 signext) - -declare void @test_int(i32) - -declare void @test_ints(i32, i32, i32, i32, i32, i32, i32, i32) - -declare void @test_ints_64bit(i32 signext, i32 zeroext, i32 zeroext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext) - -declare void @test_i1(i1) - -declare void @test_i64(i64) - -declare void @test_int_ptr(i32*) diff --git a/llvm/test/CodeGen/PowerPC/check-cpu.ll b/llvm/test/CodeGen/PowerPC/check-cpu.ll new file mode 100644 index 0000000000000..baa39024ebe8d --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/check-cpu.ll @@ -0,0 +1,11 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future < %s | FileCheck %s + + +; Test that -mcpu=future is recognized as a valid CPU on PowerPC. + +; CHECK-NOT: is not a recognized processor for this target +; CHECK: .text + diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm-1.ll b/llvm/test/CodeGen/PowerPC/fold-rlwinm-1.ll index b0586b06cd1fe..12887d8922592 100644 --- a/llvm/test/CodeGen/PowerPC/fold-rlwinm-1.ll +++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm-1.ll @@ -11,8 +11,7 @@ define void @foo(i32 signext %var1) { ; CHECK-NEXT: xori r3, r3, 1 ; CHECK-NEXT: addis r4, r2, res@toc@ha ; CHECK-NEXT: cntlzw r3, r3 -; CHECK-NEXT: srwi r3, r3, 5 -; CHECK-NEXT: slwi r3, r3, 19 +; CHECK-NEXT: rlwinm r3, r3, 14, 0, 12 ; CHECK-NEXT: stw r3, res@toc@l(r4) ; CHECK-NEXT: blr entry: @@ -30,10 +29,10 @@ define void @foo_multiple_use(i32 signext %var1) { ; CHECK-NEXT: addis r4, r2, res2@toc@ha ; CHECK-NEXT: addis r6, r2, res@toc@ha ; CHECK-NEXT: cntlzw r3, r3 -; CHECK-NEXT: srwi r3, r3, 5 -; CHECK-NEXT: slwi r5, r3, 19 -; CHECK-NEXT: stw r3, res2@toc@l(r4) -; CHECK-NEXT: stw r5, res@toc@l(r6) +; CHECK-NEXT: srwi r5, r3, 5 +; CHECK-NEXT: rlwinm r3, r3, 14, 0, 12 +; CHECK-NEXT: stw r5, res2@toc@l(r4) +; CHECK-NEXT: stw r3, res@toc@l(r6) ; CHECK-NEXT: blr entry: %cmp = icmp eq i32 %var1, 1 diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir new file mode 100644 index 0000000000000..426aaa7a76313 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir @@ -0,0 +1,140 @@ +# RUN: llc -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu \ +# RUN: -run-pass ppc-mi-peepholes %s -o - -verify-machineinstrs | FileCheck %s + +--- +name: testFoldRLWINM +# CHECK: name: testFoldRLWINM +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 27, 5, 31 + ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 5, 31 + %3:gprc = RLWINM %2:gprc, 19, 0, 12 + ; CHECK: %3:gprc = RLWINM %1, 14, 0, 12 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMSrcFullMask1 +# CHECK: name: testFoldRLWINMSrcFullMask1 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 27, 0, 31 + ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 0, 31 + %3:gprc = RLWINM %2:gprc, 19, 0, 12 + ; CHECK: %3:gprc = RLWINM %1, 14, 0, 12 + BLR8 implicit $lr8, implicit $rm +... 
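+# The cases in this file exercise one composition rule (a sketch of the
+# intent; the peephole itself may apply extra conditions): feeding
+# y = RLWINM(x, SH1, MB1, ME1) into RLWINM(y, SH2, MB2, ME2) is the same as
+# rotating x left by (SH1 + SH2) mod 32 and masking with mask(MB2, ME2)
+# AND (mask(MB1, ME1) rotated left by SH2), where bit 0 is the MSB.
+# In testFoldRLWINM above, (27 + 19) mod 32 = 14 and the composed mask is
+# still 0..12, giving RLWINM %1, 14, 0, 12. An all-zero composed mask folds
+# to LI 0 (testFoldRLWINMToZero below), and a composed mask that is not a
+# contiguous (possibly wrapped) run of bits cannot be encoded in a single
+# RLWINM, so no fold happens (testFoldRLWINMInvalidMask).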
+--- +name: testFoldRLWINMSrcFullMask2 +# CHECK: name: testFoldRLWINMSrcFullMask2 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 27, 10, 9 + ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 10, 9 + %3:gprc = RLWINM %2:gprc, 19, 10, 1 + ; CHECK: %3:gprc = RLWINM %1, 14, 10, 1 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMSrcWrapped +# CHECK: name: testFoldRLWINMSrcWrapped +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 27, 30, 10 + ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 30, 10 + %3:gprc = RLWINM %2:gprc, 19, 0, 12 + ; CHECK: %3:gprc = RLWINM %1, 14, 11, 12 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMUserWrapped +# CHECK: name: testFoldRLWINMUserWrapped +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 10, 5, 31 + ; CHECK: %2:gprc = RLWINM %1, 10, 5, 31 + %3:gprc = RLWINM %2:gprc, 10, 30, 5 + ; CHECK: %3:gprc = RLWINM %2, 10, 30, 5 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMMultipleUses +# CHECK: name: testFoldRLWINMMultipleUses +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM killed %1:gprc, 27, 5, 31 + ; CHECK: %2:gprc = RLWINM %1, 27, 5, 31 + %3:gprc = RLWINM %2:gprc, 19, 0, 12 + ; CHECK: %3:gprc = RLWINM killed %1, 14, 0, 12 + STW %3:gprc, %2:gprc, 100 + ; CHECK: STW %3, %2, 100 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMToZero +# CHECK: name: testFoldRLWINMToZero +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 27, 5, 10 + ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 5, 10 + %3:gprc = RLWINM %2:gprc, 8, 5, 10 + ; CHECK: %3:gprc = LI 0 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMoToZero +# CHECK: name: testFoldRLWINMoToZero +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 27, 5, 10 + ; CHECK-NOT: %2:gprc = RLWINM %1:gprc, 27, 5, 10 + %3:gprc = RLWINMo %2:gprc, 8, 5, 10, implicit-def $cr0 + ; CHECK: %3:gprc = ANDIo %2, 0, implicit-def $cr0 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMInvalidMask +# CHECK: name: testFoldRLWINMInvalidMask +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM %1:gprc, 20, 5, 31 + ; CHECK: %2:gprc = RLWINM %1, 20, 5, 31 + %3:gprc = RLWINM %2:gprc, 19, 10, 20 + ; CHECK: %3:gprc = RLWINM %2, 19, 10, 20 + BLR8 implicit $lr8, implicit $rm +... diff --git a/llvm/test/CodeGen/PowerPC/lower-globaladdr32-aix-asm.ll b/llvm/test/CodeGen/PowerPC/lower-globaladdr32-aix-asm.ll index e48f43a2d4b32..e3254175dbe96 100644 --- a/llvm/test/CodeGen/PowerPC/lower-globaladdr32-aix-asm.ll +++ b/llvm/test/CodeGen/PowerPC/lower-globaladdr32-aix-asm.ll @@ -41,5 +41,5 @@ define void @test_store(i32 %0) { ; LARGE: stw [[REG3:[0-9]+]], 0([[REG2]]) ; LARGE: blr -; TODO Update test when TOC-entry emission lands. 
-; CHECK-NOT: .tc +; CHECK: .tc a[TC],a +; CHECK: .tc b[TC],b diff --git a/llvm/test/CodeGen/PowerPC/lower-globaladdr64-aix-asm.ll b/llvm/test/CodeGen/PowerPC/lower-globaladdr64-aix-asm.ll index 371fa0ec279e3..6d1863bc95371 100644 --- a/llvm/test/CodeGen/PowerPC/lower-globaladdr64-aix-asm.ll +++ b/llvm/test/CodeGen/PowerPC/lower-globaladdr64-aix-asm.ll @@ -41,5 +41,5 @@ define void @test_store(i32 zeroext %0) { ; LARGE: stw [[REG3:[0-9]+]], 0([[REG2]]) ; LARGE: blr -; TODO Update test when TOC-entry emission lands. -; CHECK-NOT: .tc +; CHECK: .tc a[TC],a +; CHECK: .tc b[TC],b diff --git a/llvm/test/CodeGen/PowerPC/machine-backward-cp.mir b/llvm/test/CodeGen/PowerPC/machine-backward-cp.mir index bb2c29fc906d7..0374d55c0cb4d 100644 --- a/llvm/test/CodeGen/PowerPC/machine-backward-cp.mir +++ b/llvm/test/CodeGen/PowerPC/machine-backward-cp.mir @@ -11,8 +11,7 @@ tracksRegLiveness: true body: | bb.0.entry: ; CHECK-LABEL: name: test0 - ; CHECK: renamable $x4 = LI8 1024 - ; CHECK: $x3 = COPY killed renamable $x4 + ; CHECK: $x3 = LI8 1024 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x4 = LI8 1024 $x3 = COPY renamable killed $x4 @@ -28,8 +27,7 @@ tracksRegLiveness: true body: | ; CHECK-LABEL: name: test1 ; CHECK: bb.0.entry: - ; CHECK: renamable $x5 = LI8 42 - ; CHECK: renamable $x4 = COPY killed renamable $x5 + ; CHECK: renamable $x4 = LI8 42 ; CHECK: B %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $x4 @@ -139,8 +137,8 @@ body: | ; CHECK-LABEL: name: iterative_deletion ; CHECK: liveins: $x5 - ; CHECK: renamable $x6 = ADDI8 killed renamable $x5, 1 - ; CHECK: $x3 = COPY $x6 + ; CHECK: renamable $x4 = ADDI8 killed renamable $x5, 1 + ; CHECK: $x3 = COPY $x4 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x6 = ADDI8 renamable killed $x5, 1 renamable $x4 = COPY renamable killed $x6 @@ -160,8 +158,8 @@ body: | ; CHECK-LABEL: name: Enter ; CHECK: liveins: $x4, $x7 ; CHECK: renamable $x5 = COPY killed renamable $x7 - ; CHECK: renamable $x6 = ADDI8 killed renamable $x4, 1 - ; CHECK: $x3 = ADD8 killed renamable $x5, $x6 + ; CHECK: renamable $x7 = ADDI8 killed renamable $x4, 1 + ; CHECK: $x3 = ADD8 killed renamable $x5, killed renamable $x7 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x5 = COPY killed renamable $x7 renamable $x6 = ADDI8 killed renamable $x4, 1 @@ -181,10 +179,9 @@ body: | ; CHECK-LABEL: name: foo ; CHECK: liveins: $x4, $x7 ; CHECK: renamable $x5 = COPY killed renamable $x7 - ; CHECK: renamable $x6 = ADDI8 renamable $x4, 1 - ; CHECK: renamable $x7 = COPY killed renamable $x6 - ; CHECK: renamable $x8 = ADDI8 killed $x4, 2 - ; CHECK: $x3 = ADD8 killed renamable $x5, $x8 + ; CHECK: renamable $x7 = ADDI8 renamable $x4, 1 + ; CHECK: renamable $x6 = ADDI8 killed $x4, 2 + ; CHECK: $x3 = ADD8 killed renamable $x5, killed renamable $x6 ; CHECK: $x3 = ADD8 $x3, killed renamable $x7 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x5 = COPY killed renamable $x7 @@ -208,10 +205,10 @@ body: | ; CHECK-LABEL: name: bar ; CHECK: liveins: $x4, $x7 ; CHECK: renamable $x5 = COPY killed renamable $x7 - ; CHECK: renamable $x6 = ADDI8 renamable $x4, 1 - ; CHECK: renamable $x8 = COPY $x6 - ; CHECK: renamable $x6 = ADDI8 renamable $x5, 2 - ; CHECK: $x3 = ADD8 killed renamable $x5, $x6 + ; CHECK: renamable $x7 = ADDI8 renamable $x4, 1 + ; CHECK: renamable $x8 = COPY killed renamable $x7 + ; CHECK: renamable $x7 = ADDI8 renamable $x5, 2 + ; CHECK: $x3 = ADD8 killed renamable $x5, killed renamable $x7 ; CHECK: $x3 = ADD8 
$x3, killed renamable $x8 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x5 = COPY killed renamable $x7 @@ -236,10 +233,9 @@ body: | ; CHECK-LABEL: name: bogus ; CHECK: liveins: $x7 ; CHECK: renamable $x5 = COPY renamable $x7 - ; CHECK: renamable $x6 = ADDI8 $x7, 1 - ; CHECK: renamable $x7 = COPY $x6 + ; CHECK: renamable $x4 = ADDI8 $x7, 1 ; CHECK: renamable $x6 = ADDI8 renamable $x5, 2 - ; CHECK: $x3 = ADD8 $x7, killed renamable $x5 + ; CHECK: $x3 = ADD8 killed renamable $x4, killed renamable $x5 ; CHECK: $x3 = ADD8 $x3, killed renamable $x6 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x5 = COPY killed renamable $x7 @@ -263,10 +259,10 @@ body: | liveins: $x7 ; CHECK-LABEL: name: foobar ; CHECK: liveins: $x7 - ; CHECK: renamable $x6 = ADDI8 $x7, 1 - ; CHECK: renamable $x8 = COPY $x6 - ; CHECK: renamable $x6 = ADDI8 $x7, 2 - ; CHECK: $x3 = ADD8 $x6, $x7 + ; CHECK: renamable $x4 = ADDI8 $x7, 1 + ; CHECK: renamable $x8 = COPY killed renamable $x4 + ; CHECK: renamable $x4 = ADDI8 $x7, 2 + ; CHECK: $x3 = ADD8 killed renamable $x4, $x7 ; CHECK: $x3 = ADD8 $x3, killed renamable $x8 ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 renamable $x5 = COPY killed renamable $x7 @@ -280,3 +276,22 @@ body: | BLR8 implicit $lr8, implicit undef $rm, implicit $x3 ... + +--- +name: cross_call +alignment: 4 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x2, $x3, $x20 + ; CHECK-LABEL: name: cross_call + ; CHECK: liveins: $x2, $x3, $x20 + ; CHECK: renamable $x20 = LI8 1024 + ; CHECK: BL8_NOP @foo, csr_svr464_altivec, implicit-def $lr8, implicit $rm, implicit $x3, implicit-def $x3, implicit $x2 + ; CHECK: $x3 = COPY killed renamable $x20 + ; CHECK: BLR8 implicit $lr8, implicit undef $rm, implicit $x3 + renamable $x20 = LI8 1024 + BL8_NOP @foo, csr_svr464_altivec, implicit-def $lr8, implicit $rm, implicit $x3, implicit-def $x3, implicit $x2 + $x3 = COPY renamable killed $x20 + BLR8 implicit $lr8, implicit undef $rm, implicit $x3 +... diff --git a/llvm/test/CodeGen/PowerPC/ppc-passname.ll b/llvm/test/CodeGen/PowerPC/ppc-passname.ll index 005f0a25c5637..98343bdb535c2 100644 --- a/llvm/test/CodeGen/PowerPC/ppc-passname.ll +++ b/llvm/test/CodeGen/PowerPC/ppc-passname.ll @@ -1,13 +1,13 @@ -; Test pass name: ppc-loop-preinc-prep. -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP -; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep -; STOP-BEFORE-LOOP-PREINC-PREP-NOT: "ppc-loop-preinc-prep" pass is not registered. -; STOP-BEFORE-LOOP-PREINC-PREP-NOT: Prepare loop for pre-inc. addressing modes - -; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-LOOP-PREINC-PREP -; STOP-AFTER-LOOP-PREINC-PREP: -ppc-loop-preinc-prep -; STOP-AFTER-LOOP-PREINC-PREP-NOT: "ppc-loop-preinc-prep" pass is not registered. -; STOP-AFTER-LOOP-PREINC-PREP: Prepare loop for pre-inc. addressing modes +; Test pass name: ppc-loop-instr-form-prep. +; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-instr-form-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-INSTR-FORM-PREP +; STOP-BEFORE-LOOP-INSTR-FORM-PREP-NOT: -ppc-loop-instr-form-prep +; STOP-BEFORE-LOOP-INSTR-FORM-PREP-NOT: "ppc-loop-instr-form-prep" pass is not registered. 
+; STOP-BEFORE-LOOP-INSTR-FORM-PREP-NOT: Prepare loop for ppc preferred instruction forms + +; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-loop-instr-form-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-LOOP-INSTR-FORM-PREP +; STOP-AFTER-LOOP-INSTR-FORM-PREP: -ppc-loop-instr-form-prep +; STOP-AFTER-LOOP-INSTR-FORM-PREP-NOT: "ppc-loop-instr-form-prep" pass is not registered. +; STOP-AFTER-LOOP-INSTR-FORM-PREP: Prepare loop for ppc preferred instruction forms ; Test pass name: ppc-toc-reg-deps. diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll new file mode 100644 index 0000000000000..179ddc1980a94 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll @@ -0,0 +1,1569 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O3 -mtriple=powerpc64le-linux-gnu < %s | FileCheck --check-prefix=PC64LE %s +; RUN: llc -O3 -mtriple=powerpc64le-linux-gnu -mcpu=pwr9 < %s | FileCheck --check-prefix=PC64LE9 %s +; RUN: llc -O3 -mtriple=powerpc64-linux-gnu < %s | FileCheck --check-prefix=PC64 %s + +define ppc_fp128 @test_fadd_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_fadd_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __gcc_qadd +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fadd_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __gcc_qadd +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fadd_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __gcc_qadd +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %add = call ppc_fp128 @llvm.experimental.constrained.fadd.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %add +} + +define ppc_fp128 @test_fsub_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_fsub_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __gcc_qsub +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fsub_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __gcc_qsub +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fsub_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __gcc_qsub +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %sub = call ppc_fp128 @llvm.experimental.constrained.fsub.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata 
!"fpexcept.strict") + ret ppc_fp128 %sub +} + +define ppc_fp128 @test_fmul_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_fmul_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __gcc_qmul +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fmul_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __gcc_qmul +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fmul_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __gcc_qmul +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %mul = call ppc_fp128 @llvm.experimental.constrained.fmul.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %mul +} + +define ppc_fp128 @test_fdiv_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_fdiv_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __gcc_qdiv +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fdiv_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __gcc_qdiv +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fdiv_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __gcc_qdiv +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %div = call ppc_fp128 @llvm.experimental.constrained.fdiv.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %div +} + +define ppc_fp128 @test_frem_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_frem_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl fmodl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_frem_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl fmodl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_frem_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl fmodl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %rem = call ppc_fp128 @llvm.experimental.constrained.frem.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + 
metadata !"fpexcept.strict") + ret ppc_fp128 %rem +} + +define ppc_fp128 @test_fma_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second, ppc_fp128 %third) nounwind { +; PC64LE-LABEL: test_fma_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl fmal +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fma_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl fmal +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fma_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl fmal +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %add = call ppc_fp128 @llvm.experimental.constrained.fma.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + ppc_fp128 %third, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %add +} + +define ppc_fp128 @test_sqrt_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_sqrt_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl sqrtl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_sqrt_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl sqrtl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_sqrt_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl sqrtl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %sqrt = call ppc_fp128 @llvm.experimental.constrained.sqrt.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %sqrt +} + +define ppc_fp128 @test_pow_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_pow_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl powl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_pow_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl powl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_pow_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl powl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %pow = call ppc_fp128 @llvm.experimental.constrained.pow.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret 
ppc_fp128 %pow +} + +define ppc_fp128 @test_powi_ppc_fp128(ppc_fp128 %first, i32 %second) nounwind { +; PC64LE-LABEL: test_powi_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: clrldi 5, 5, 32 +; PC64LE-NEXT: bl __powitf2 +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_powi_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: clrldi 5, 5, 32 +; PC64LE9-NEXT: bl __powitf2 +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_powi_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: clrldi 5, 5, 32 +; PC64-NEXT: bl __powitf2 +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %powi = call ppc_fp128 @llvm.experimental.constrained.powi.ppcf128( + ppc_fp128 %first, + i32 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %powi +} + +define ppc_fp128 @test_sin_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_sin_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl sinl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_sin_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl sinl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_sin_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl sinl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %sin = call ppc_fp128 @llvm.experimental.constrained.sin.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %sin +} + +define ppc_fp128 @test_cos_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_cos_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl cosl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_cos_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl cosl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_cos_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl cosl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %cos = call ppc_fp128 @llvm.experimental.constrained.cos.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + 
ret ppc_fp128 %cos +} + +define ppc_fp128 @test_exp_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_exp_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl expl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_exp_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl expl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_exp_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl expl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %exp = call ppc_fp128 @llvm.experimental.constrained.exp.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %exp +} + +define ppc_fp128 @test_exp2_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_exp2_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl exp2l +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_exp2_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl exp2l +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_exp2_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl exp2l +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %exp2 = call ppc_fp128 @llvm.experimental.constrained.exp2.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %exp2 +} + +define ppc_fp128 @test_log_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_log_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl logl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_log_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl logl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_log_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl logl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %log = call ppc_fp128 @llvm.experimental.constrained.log.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %log +} + +define ppc_fp128 @test_log2_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_log2_ppc_fp128: +; PC64LE: # %bb.0: # %entry 
+; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl log2l +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_log2_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl log2l +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_log2_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl log2l +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %log2 = call ppc_fp128 @llvm.experimental.constrained.log2.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %log2 +} + +define ppc_fp128 @test_log10_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_log10_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl log10l +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_log10_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl log10l +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_log10_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl log10l +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %log10 = call ppc_fp128 @llvm.experimental.constrained.log10.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %log10 +} + +define ppc_fp128 @test_rint_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_rint_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl rintl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_rint_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl rintl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_rint_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl rintl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %rint = call ppc_fp128 @llvm.experimental.constrained.rint.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %rint +} + +define ppc_fp128 @test_nearbyint_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_nearbyint_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl nearbyintl +; 
PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_nearbyint_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl nearbyintl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_nearbyint_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl nearbyintl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %nearbyint = call ppc_fp128 @llvm.experimental.constrained.nearbyint.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %nearbyint +} + +define ppc_fp128 @test_maxnum_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_maxnum_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl fmaxl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_maxnum_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl fmaxl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_maxnum_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl fmaxl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %maxnum = call ppc_fp128 @llvm.experimental.constrained.maxnum.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %maxnum +} + +define ppc_fp128 @test_minnum_ppc_fp128(ppc_fp128 %first, ppc_fp128 %second) nounwind { +; PC64LE-LABEL: test_minnum_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl fminl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_minnum_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl fminl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_minnum_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl fminl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %minnum = call ppc_fp128 @llvm.experimental.constrained.minnum.ppcf128( + ppc_fp128 %first, + ppc_fp128 %second, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %minnum +} + +define ppc_fp128 @test_ceil_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_ceil_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl ceill 
+; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_ceil_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl ceill +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_ceil_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl ceill +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %ceil = call ppc_fp128 @llvm.experimental.constrained.ceil.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %ceil +} + +define ppc_fp128 @test_floor_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_floor_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl floorl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_floor_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl floorl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_floor_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl floorl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %floor = call ppc_fp128 @llvm.experimental.constrained.floor.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %floor +} + +define ppc_fp128 @test_round_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_round_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl roundl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_round_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl roundl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_round_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl roundl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %round = call ppc_fp128 @llvm.experimental.constrained.round.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %round +} + +define ppc_fp128 @test_trunc_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_trunc_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl truncl +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: 
blr +; +; PC64LE9-LABEL: test_trunc_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl truncl +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_trunc_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl truncl +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %trunc = call ppc_fp128 @llvm.experimental.constrained.trunc.ppcf128( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret ppc_fp128 %trunc +} + +define float @test_fptrunc_ppc_fp128_f32(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_fptrunc_ppc_fp128_f32: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: frsp 1, 1 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fptrunc_ppc_fp128_f32: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: frsp 1, 1 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fptrunc_ppc_fp128_f32: +; PC64: # %bb.0: # %entry +; PC64-NEXT: frsp 1, 1 +; PC64-NEXT: blr +entry: + %fptrunc = call float @llvm.experimental.constrained.fptrunc.ppcf128.f32( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret float %fptrunc +} + +define double @test_fptrunc_ppc_fp128_f64(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_fptrunc_ppc_fp128_f64: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fptrunc_ppc_fp128_f64: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fptrunc_ppc_fp128_f64: +; PC64: # %bb.0: # %entry +; PC64-NEXT: blr +entry: + %fptrunc = call double @llvm.experimental.constrained.fptrunc.ppcf128.f64( + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %fptrunc +} + +define ppc_fp128 @test_fpext_ppc_fp128_f32(float %first) nounwind { +; PC64LE-LABEL: test_fpext_ppc_fp128_f32: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: xxlxor 2, 2, 2 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fpext_ppc_fp128_f32: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: xxlxor 2, 2, 2 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fpext_ppc_fp128_f32: +; PC64: # %bb.0: # %entry +; PC64-NEXT: addis 3, 2, .LCPI26_0@toc@ha +; PC64-NEXT: lfs 2, .LCPI26_0@toc@l(3) +; PC64-NEXT: blr +entry: + %fpext = call ppc_fp128 @llvm.experimental.constrained.fpext.f32.ppcf128( + float %first, + metadata !"fpexcept.strict") + ret ppc_fp128 %fpext +} + +define ppc_fp128 @test_fpext_ppc_fp128_f64(double %first) nounwind { +; PC64LE-LABEL: test_fpext_ppc_fp128_f64: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: xxlxor 2, 2, 2 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fpext_ppc_fp128_f64: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: xxlxor 2, 2, 2 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fpext_ppc_fp128_f64: +; PC64: # %bb.0: # %entry +; PC64-NEXT: addis 3, 2, .LCPI27_0@toc@ha +; PC64-NEXT: lfs 2, .LCPI27_0@toc@l(3) +; PC64-NEXT: blr +entry: + %fpext = call ppc_fp128 @llvm.experimental.constrained.fpext.f64.ppcf128( + double %first, + metadata !"fpexcept.strict") + ret ppc_fp128 %fpext +} + +define i64 @test_fptosi_ppc_i64_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_fptosi_ppc_i64_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __fixtfdi +; PC64LE-NEXT: nop +; 
PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fptosi_ppc_i64_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __fixtfdi +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fptosi_ppc_i64_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __fixtfdi +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %fpext = call i64 @llvm.experimental.constrained.fptosi.i64.ppcf128( + ppc_fp128 %first, + metadata !"fpexcept.strict") + ret i64 %fpext +} + +define i32 @test_fptosi_ppc_i32_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_fptosi_ppc_i32_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __gcc_qtou +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fptosi_ppc_i32_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __gcc_qtou +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fptosi_ppc_i32_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __gcc_qtou +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %fpext = call i32 @llvm.experimental.constrained.fptosi.i32.ppcf128( + ppc_fp128 %first, + metadata !"fpexcept.strict") + ret i32 %fpext +} + +define i64 @test_fptoui_ppc_i64_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_fptoui_ppc_i64_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __fixunstfdi +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fptoui_ppc_i64_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __fixunstfdi +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fptoui_ppc_i64_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __fixunstfdi +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %fpext = call i64 @llvm.experimental.constrained.fptoui.i64.ppcf128( + ppc_fp128 %first, + metadata !"fpexcept.strict") + ret i64 %fpext +} + +define i32 @test_fptoui_ppc_i32_ppc_fp128(ppc_fp128 %first) nounwind { +; PC64LE-LABEL: test_fptoui_ppc_i32_ppc_fp128: +; PC64LE: # %bb.0: # %entry +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -32(1) +; PC64LE-NEXT: bl __fixunstfsi +; PC64LE-NEXT: nop +; PC64LE-NEXT: addi 1, 1, 32 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; 
PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_fptoui_ppc_i32_ppc_fp128: +; PC64LE9: # %bb.0: # %entry +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -32(1) +; PC64LE9-NEXT: bl __fixunstfsi +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: addi 1, 1, 32 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_fptoui_ppc_i32_ppc_fp128: +; PC64: # %bb.0: # %entry +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -112(1) +; PC64-NEXT: bl __fixunstfsi +; PC64-NEXT: nop +; PC64-NEXT: addi 1, 1, 112 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr +entry: + %fpext = call i32 @llvm.experimental.constrained.fptoui.i32.ppcf128( + ppc_fp128 %first, + metadata !"fpexcept.strict") + ret i32 %fpext +} + +; Test that resultant libcalls retain order even when their non-strict FLOP form could be +; trivially optimized into differing sequences. +define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %result) nounwind { +; PC64LE-LABEL: test_constrained_libcall_multichain: +; PC64LE: # %bb.0: +; PC64LE-NEXT: mflr 0 +; PC64LE-NEXT: std 29, -48(1) # 8-byte Folded Spill +; PC64LE-NEXT: std 30, -40(1) # 8-byte Folded Spill +; PC64LE-NEXT: stfd 29, -24(1) # 8-byte Folded Spill +; PC64LE-NEXT: stfd 30, -16(1) # 8-byte Folded Spill +; PC64LE-NEXT: stfd 31, -8(1) # 8-byte Folded Spill +; PC64LE-NEXT: std 0, 16(1) +; PC64LE-NEXT: stdu 1, -80(1) +; PC64LE-NEXT: mr 29, 3 +; PC64LE-NEXT: xxlxor 2, 2, 2 +; PC64LE-NEXT: li 3, 0 +; PC64LE-NEXT: mr 30, 4 +; PC64LE-NEXT: lfsx 31, 0, 29 +; PC64LE-NEXT: xxlxor 4, 4, 4 +; PC64LE-NEXT: std 3, 8(4) +; PC64LE-NEXT: fmr 1, 31 +; PC64LE-NEXT: fmr 3, 31 +; PC64LE-NEXT: stfdx 31, 0, 4 +; PC64LE-NEXT: bl __gcc_qadd +; PC64LE-NEXT: nop +; PC64LE-NEXT: fmr 3, 1 +; PC64LE-NEXT: fmr 4, 2 +; PC64LE-NEXT: fmr 30, 1 +; PC64LE-NEXT: fmr 29, 2 +; PC64LE-NEXT: stfd 1, 16(30) +; PC64LE-NEXT: stfd 2, 24(30) +; PC64LE-NEXT: bl __gcc_qmul +; PC64LE-NEXT: nop +; PC64LE-NEXT: fmr 1, 31 +; PC64LE-NEXT: xxlxor 2, 2, 2 +; PC64LE-NEXT: li 5, 2 +; PC64LE-NEXT: stfd 30, 32(30) +; PC64LE-NEXT: stfd 29, 40(30) +; PC64LE-NEXT: bl __powitf2 +; PC64LE-NEXT: nop +; PC64LE-NEXT: frsp 0, 1 +; PC64LE-NEXT: stfsx 0, 0, 29 +; PC64LE-NEXT: stfd 2, -8(30) +; PC64LE-NEXT: stfd 1, -16(30) +; PC64LE-NEXT: addi 1, 1, 80 +; PC64LE-NEXT: ld 0, 16(1) +; PC64LE-NEXT: mtlr 0 +; PC64LE-NEXT: lfd 31, -8(1) # 8-byte Folded Reload +; PC64LE-NEXT: lfd 30, -16(1) # 8-byte Folded Reload +; PC64LE-NEXT: ld 30, -40(1) # 8-byte Folded Reload +; PC64LE-NEXT: ld 29, -48(1) # 8-byte Folded Reload +; PC64LE-NEXT: lfd 29, -24(1) # 8-byte Folded Reload +; PC64LE-NEXT: blr +; +; PC64LE9-LABEL: test_constrained_libcall_multichain: +; PC64LE9: # %bb.0: +; PC64LE9-NEXT: mflr 0 +; PC64LE9-NEXT: std 29, -48(1) # 8-byte Folded Spill +; PC64LE9-NEXT: std 30, -40(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stfd 29, -24(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stfd 30, -16(1) # 8-byte Folded Spill +; PC64LE9-NEXT: stfd 31, -8(1) # 8-byte Folded Spill +; PC64LE9-NEXT: std 0, 16(1) +; PC64LE9-NEXT: stdu 1, -80(1) +; PC64LE9-NEXT: lfs 31, 0(3) +; PC64LE9-NEXT: mr 29, 3 +; PC64LE9-NEXT: li 3, 0 +; PC64LE9-NEXT: xxlxor 2, 2, 2 +; PC64LE9-NEXT: xxlxor 4, 4, 4 +; PC64LE9-NEXT: std 3, 8(4) +; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: fmr 3, 31 +; PC64LE9-NEXT: mr 30, 4 +; PC64LE9-NEXT: stfd 31, 0(4) +; PC64LE9-NEXT: bl __gcc_qadd +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: fmr 3, 1 +; PC64LE9-NEXT: fmr 4, 2 +; PC64LE9-NEXT: fmr 30, 2 +; PC64LE9-NEXT: fmr 
29, 1 +; PC64LE9-NEXT: stfd 1, 16(30) +; PC64LE9-NEXT: stfd 2, 24(30) +; PC64LE9-NEXT: bl __gcc_qmul +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: fmr 1, 31 +; PC64LE9-NEXT: xxlxor 2, 2, 2 +; PC64LE9-NEXT: li 5, 2 +; PC64LE9-NEXT: stfd 29, 32(30) +; PC64LE9-NEXT: stfd 30, 40(30) +; PC64LE9-NEXT: bl __powitf2 +; PC64LE9-NEXT: nop +; PC64LE9-NEXT: frsp 0, 1 +; PC64LE9-NEXT: stfs 0, 0(29) +; PC64LE9-NEXT: stfd 2, -8(30) +; PC64LE9-NEXT: stfd 1, -16(30) +; PC64LE9-NEXT: addi 1, 1, 80 +; PC64LE9-NEXT: ld 0, 16(1) +; PC64LE9-NEXT: mtlr 0 +; PC64LE9-NEXT: lfd 31, -8(1) # 8-byte Folded Reload +; PC64LE9-NEXT: lfd 30, -16(1) # 8-byte Folded Reload +; PC64LE9-NEXT: lfd 29, -24(1) # 8-byte Folded Reload +; PC64LE9-NEXT: ld 30, -40(1) # 8-byte Folded Reload +; PC64LE9-NEXT: ld 29, -48(1) # 8-byte Folded Reload +; PC64LE9-NEXT: blr +; +; PC64-LABEL: test_constrained_libcall_multichain: +; PC64: # %bb.0: +; PC64-NEXT: mflr 0 +; PC64-NEXT: std 0, 16(1) +; PC64-NEXT: stdu 1, -176(1) +; PC64-NEXT: std 29, 120(1) # 8-byte Folded Spill +; PC64-NEXT: mr 29, 3 +; PC64-NEXT: li 3, 0 +; PC64-NEXT: stfd 31, 168(1) # 8-byte Folded Spill +; PC64-NEXT: stfd 30, 160(1) # 8-byte Folded Spill +; PC64-NEXT: std 30, 128(1) # 8-byte Folded Spill +; PC64-NEXT: stfd 28, 144(1) # 8-byte Folded Spill +; PC64-NEXT: stfd 29, 152(1) # 8-byte Folded Spill +; PC64-NEXT: mr 30, 4 +; PC64-NEXT: lfs 31, 0(29) +; PC64-NEXT: std 3, 8(4) +; PC64-NEXT: addis 3, 2, .LCPI32_0@toc@ha +; PC64-NEXT: lfs 30, .LCPI32_0@toc@l(3) +; PC64-NEXT: fmr 1, 31 +; PC64-NEXT: fmr 3, 31 +; PC64-NEXT: fmr 2, 30 +; PC64-NEXT: fmr 4, 30 +; PC64-NEXT: stfd 31, 0(4) +; PC64-NEXT: bl __gcc_qadd +; PC64-NEXT: nop +; PC64-NEXT: fmr 3, 1 +; PC64-NEXT: fmr 4, 2 +; PC64-NEXT: fmr 29, 1 +; PC64-NEXT: fmr 28, 2 +; PC64-NEXT: stfd 1, 16(30) +; PC64-NEXT: stfd 2, 24(30) +; PC64-NEXT: bl __gcc_qmul +; PC64-NEXT: nop +; PC64-NEXT: fmr 1, 31 +; PC64-NEXT: fmr 2, 30 +; PC64-NEXT: li 5, 2 +; PC64-NEXT: stfd 29, 32(30) +; PC64-NEXT: stfd 28, 40(30) +; PC64-NEXT: bl __powitf2 +; PC64-NEXT: nop +; PC64-NEXT: frsp 0, 1 +; PC64-NEXT: stfs 0, 0(29) +; PC64-NEXT: lfd 31, 168(1) # 8-byte Folded Reload +; PC64-NEXT: lfd 30, 160(1) # 8-byte Folded Reload +; PC64-NEXT: lfd 29, 152(1) # 8-byte Folded Reload +; PC64-NEXT: lfd 28, 144(1) # 8-byte Folded Reload +; PC64-NEXT: ld 29, 120(1) # 8-byte Folded Reload +; PC64-NEXT: stfd 2, -8(30) +; PC64-NEXT: stfd 1, -16(30) +; PC64-NEXT: ld 30, 128(1) # 8-byte Folded Reload +; PC64-NEXT: addi 1, 1, 176 +; PC64-NEXT: ld 0, 16(1) +; PC64-NEXT: mtlr 0 +; PC64-NEXT: blr + %load = load float, float* %firstptr + %first = call ppc_fp128 @llvm.experimental.constrained.fpext.f32.ppcf128( + float %load, + metadata !"fpexcept.strict") + store ppc_fp128 %first, ppc_fp128* %result + + ; For unconstrained FLOPs, these next two FP instructions would necessarily + ; be executed in series with one another. 
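+ ; (%fmul consumes %fadd, so the __gcc_qadd and __gcc_qmul libcalls they
+ ; legalize to are already data-dependent; and because both calls are marked
+ ; "fpexcept.strict", the %fmul call must still be emitted even though its
+ ; result is never stored below.)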
+ %fadd = call ppc_fp128 @llvm.experimental.constrained.fadd.ppcf128( + ppc_fp128 %first, + ppc_fp128 %first, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %stridx1 = getelementptr ppc_fp128, ppc_fp128* %result, i32 1 + store ppc_fp128 %fadd, ppc_fp128* %stridx1 + %fmul = call ppc_fp128 @llvm.experimental.constrained.fmul.ppcf128( + ppc_fp128 %fadd, + ppc_fp128 %fadd, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %stridx2 = getelementptr ppc_fp128, ppc_fp128* %stridx1, i32 1 + store ppc_fp128 %fadd, ppc_fp128* %stridx2 + + ; For unconstrained FLOPs, these next two FP instructions could be reordered + ; or even executed in parallel with respect to the previous two instructions. + ; However, strict floating point rules would not allow this. + %powi = call ppc_fp128 @llvm.experimental.constrained.powi.ppcf128( + ppc_fp128 %first, + i32 2, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %tinypow = call float @llvm.experimental.constrained.fptrunc.ppcf128.f32( + ppc_fp128 %powi, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + store float %tinypow, float* %firstptr + %stridxn1 = getelementptr ppc_fp128, ppc_fp128* %result, i32 -1 + store ppc_fp128 %powi, ppc_fp128* %stridxn1 + ret void +} + +declare ppc_fp128 @llvm.experimental.constrained.fadd.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.ceil.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.cos.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.fdiv.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.exp.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.exp2.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.floor.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.fma.ppcf128(ppc_fp128, ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.fpext.f32.ppcf128(float, metadata) +declare ppc_fp128 @llvm.experimental.constrained.fpext.f64.ppcf128(double, metadata) +declare float @llvm.experimental.constrained.fptrunc.ppcf128.f32(ppc_fp128, metadata, metadata) +declare double @llvm.experimental.constrained.fptrunc.ppcf128.f64(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.log.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.log10.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.log2.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.maxnum.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.minnum.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.fmul.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.nearbyint.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.pow.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.powi.ppcf128(ppc_fp128, i32, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.frem.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.rint.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 
@llvm.experimental.constrained.round.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.sin.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.sqrt.ppcf128(ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.fsub.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata) +declare ppc_fp128 @llvm.experimental.constrained.trunc.ppcf128(ppc_fp128, metadata, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.ppcf128(ppc_fp128, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.ppcf128(ppc_fp128, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.ppcf128(ppc_fp128, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.ppcf128(ppc_fp128, metadata) diff --git a/llvm/test/CodeGen/PowerPC/pr36292.ll b/llvm/test/CodeGen/PowerPC/pr36292.ll index 883d26b669088..a859121bb505c 100644 --- a/llvm/test/CodeGen/PowerPC/pr36292.ll +++ b/llvm/test/CodeGen/PowerPC/pr36292.ll @@ -15,8 +15,7 @@ define void @test() nounwind comdat { ; CHECK-NEXT: ld 29, 0(3) ; CHECK-NEXT: ld 30, 32(1) ; CHECK-NEXT: cmpld 30, 29 -; CHECK-NEXT: bge- 0, .LBB0_2 -; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: bge 0, .LBB0_2 ; CHECK-NEXT: .LBB0_1: # %bounds.ok ; CHECK-NEXT: # ; CHECK-NEXT: lfsx 2, 0, 3 @@ -26,7 +25,7 @@ define void @test() nounwind comdat { ; CHECK-NEXT: addi 30, 30, 1 ; CHECK-NEXT: stfsx 1, 0, 3 ; CHECK-NEXT: cmpld 30, 29 -; CHECK-NEXT: blt+ 0, .LBB0_1 +; CHECK-NEXT: blt 0, .LBB0_1 ; CHECK-NEXT: .LBB0_2: # %bounds.fail ; CHECK-NEXT: std 30, 32(1) %pos = alloca i64, align 8 diff --git a/llvm/test/CodeGen/PowerPC/pr44183.ll b/llvm/test/CodeGen/PowerPC/pr44183.ll new file mode 100644 index 0000000000000..1a6f932bc6d07 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr44183.ll @@ -0,0 +1,56 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s +%struct.m.2.5.8.11 = type { %struct.l.0.3.6.9, [7 x i8], %struct.a.1.4.7.10 } +%struct.l.0.3.6.9 = type { i8 } +%struct.a.1.4.7.10 = type { [27 x i8], [0 x i32], [4 x i8] } +define void @_ZN1m1nEv(%struct.m.2.5.8.11* %this) local_unnamed_addr nounwind align 2 { +; CHECK-LABEL: _ZN1m1nEv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -48(r1) +; CHECK-NEXT: mr r30, r3 +; CHECK-NEXT: ld r4, 8(r30) +; CHECK-NEXT: lwz r5, 36(r30) +; CHECK-NEXT: rldicl r4, r4, 60, 4 +; CHECK-NEXT: rlwinm r3, r4, 31, 0, 0 +; CHECK-NEXT: rlwinm r4, r5, 0, 31, 31 +; CHECK-NEXT: or r4, r4, r3 +; CHECK-NEXT: bl _ZN1llsE1d +; CHECK-NEXT: nop +; CHECK-NEXT: ld r3, 16(r30) +; CHECK-NEXT: ld r4, 8(r30) +; CHECK-NEXT: rldicl r4, r4, 60, 4 +; CHECK-NEXT: sldi r3, r3, 60 +; CHECK-NEXT: or r3, r4, r3 +; CHECK-NEXT: sldi r3, r3, 31 +; CHECK-NEXT: clrldi r4, r3, 32 +; CHECK-NEXT: bl _ZN1llsE1d +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 48 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-NEXT: blr +entry: + %bc = getelementptr inbounds %struct.m.2.5.8.11, %struct.m.2.5.8.11* %this, i64 0, i32 2 + %0 = bitcast %struct.a.1.4.7.10* %bc to i216* + %bf.load = load i216, i216* %0, align 8 + %bf.lshr = lshr i216 %bf.load, 4 + %shl.i23 = shl i216 %bf.lshr, 31 + %shl.i = trunc i216 %shl.i23 to i32 + %arrayidx = getelementptr inbounds 
%struct.m.2.5.8.11, %struct.m.2.5.8.11* %this, i64 0, i32 2, i32 1, i64 0 + %1 = load i32, i32* %arrayidx, align 4 + %and.i = and i32 %1, 1 + %or.i = or i32 %and.i, %shl.i + tail call void @_ZN1llsE1d(%struct.l.0.3.6.9* undef, i32 %or.i) #1 + %bf.load10 = load i216, i216* %0, align 8 + %bf.lshr11 = lshr i216 %bf.load10, 4 + %shl.i1524 = shl i216 %bf.lshr11, 31 + %shl.i15 = trunc i216 %shl.i1524 to i32 + tail call void @_ZN1llsE1d(%struct.l.0.3.6.9* undef, i32 %shl.i15) #1 + ret void +} +declare void @_ZN1llsE1d(%struct.l.0.3.6.9*, i32) local_unnamed_addr #0 diff --git a/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll b/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll index 6aaf169dabee4..dd41abd093d62 100644 --- a/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll +++ b/llvm/test/CodeGen/PowerPC/redundant-copy-after-tail-dup.ll @@ -26,8 +26,7 @@ define dso_local i1 @t(%class.A* %this, i32 %color, i32 %vertex) local_unnamed_a ; CHECK-P9-NEXT: cmplwi r3, 2 ; CHECK-P9-NEXT: bge- cr0, .LBB0_6 ; CHECK-P9-NEXT: # %bb.3: # %land.lhs.true.1 -; CHECK-P9-NEXT: li r5, 0 -; CHECK-P9-NEXT: mr r3, r5 +; CHECK-P9-NEXT: li r3, 0 ; CHECK-P9-NEXT: blr ; CHECK-P9-NEXT: .LBB0_4: # %lor.lhs.false ; CHECK-P9-NEXT: cmplwi cr0, r4, 0 diff --git a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll index 8fdcd1eac4505..7804b0a3f0979 100644 --- a/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll +++ b/llvm/test/CodeGen/PowerPC/sms-cpy-1.ll @@ -44,7 +44,6 @@ define void @print_res() nounwind { ; CHECK-NEXT: lbz 5, 0(5) ; CHECK-NEXT: addi 3, 3, 1 ; CHECK-NEXT: bdz .LBB0_4 -; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: # ; CHECK-NEXT: clrldi 10, 8, 32 ; CHECK-NEXT: cntlzw 9, 6 diff --git a/llvm/test/CodeGen/RISCV/alu64.ll b/llvm/test/CodeGen/RISCV/alu64.ll index dd336065ef6c0..1160b4055674d 100644 --- a/llvm/test/CodeGen/RISCV/alu64.ll +++ b/llvm/test/CodeGen/RISCV/alu64.ll @@ -224,8 +224,7 @@ define i64 @sll(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: srli a4, a0, 1 ; RV32I-NEXT: srl a3, a4, a3 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sll a2, a0, a2 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: sll a0, a0, a2 ; RV32I-NEXT: ret %1 = shl i64 %a, %b ret i64 %1 @@ -311,8 +310,7 @@ define i64 @srl(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: slli a4, a1, 1 ; RV32I-NEXT: sll a3, a4, a3 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: srl a2, a1, a2 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: ret %1 = lshr i64 %a, %b ret i64 %1 diff --git a/llvm/test/CodeGen/RISCV/copysign-casts.ll b/llvm/test/CodeGen/RISCV/copysign-casts.ll new file mode 100644 index 0000000000000..acd64c203657a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/copysign-casts.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+f \ +; RUN: -target-abi ilp32f < %s | FileCheck %s -check-prefix=RV32IF +; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+f -mattr=+d \ +; RUN: -target-abi ilp32d < %s | FileCheck %s -check-prefix=RV32IFD +; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+f -mattr=+d \ +; RUN: -target-abi lp64d < %s | FileCheck %s -check-prefix=RV64IFD + +; Test fcopysign scenarios where the sign argument is cast to the type of the +; magnitude argument.
Those casts can be folded away by the DAGCombiner. + +declare double @llvm.copysign.f64(double, double) +declare float @llvm.copysign.f32(float, float) + +define double @fold_promote(double %a, float %b) nounwind { +; RV32I-LABEL: fold_promote: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a3, 524288 +; RV32I-NEXT: and a2, a2, a3 +; RV32I-NEXT: addi a3, a3, -1 +; RV32I-NEXT: and a1, a1, a3 +; RV32I-NEXT: or a1, a1, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fold_promote: +; RV64I: # %bb.0: +; RV64I-NEXT: addi a2, zero, -1 +; RV64I-NEXT: slli a2, a2, 63 +; RV64I-NEXT: addi a2, a2, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: addi a2, zero, 1 +; RV64I-NEXT: slli a2, a2, 31 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32IF-LABEL: fold_promote: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fmv.x.w a2, fa0 +; RV32IF-NEXT: lui a3, 524288 +; RV32IF-NEXT: and a2, a2, a3 +; RV32IF-NEXT: addi a3, a3, -1 +; RV32IF-NEXT: and a1, a1, a3 +; RV32IF-NEXT: or a1, a1, a2 +; RV32IF-NEXT: ret +; +; RV32IFD-LABEL: fold_promote: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: fcvt.d.s ft0, fa1 +; RV32IFD-NEXT: fsgnj.d fa0, fa0, ft0 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: fold_promote: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.d.s ft0, fa1 +; RV64IFD-NEXT: fsgnj.d fa0, fa0, ft0 +; RV64IFD-NEXT: ret + %c = fpext float %b to double + %t = call double @llvm.copysign.f64(double %a, double %c) + ret double %t +} + +define float @fold_demote(float %a, double %b) nounwind { +; RV32I-LABEL: fold_demote: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 524288 +; RV32I-NEXT: and a2, a2, a1 +; RV32I-NEXT: addi a1, a1, -1 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: ret +; +; RV64I-LABEL: fold_demote: +; RV64I: # %bb.0: +; RV64I-NEXT: lui a2, 524288 +; RV64I-NEXT: addiw a2, a2, -1 +; RV64I-NEXT: and a0, a0, a2 +; RV64I-NEXT: addi a2, zero, -1 +; RV64I-NEXT: slli a2, a2, 63 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: srli a1, a1, 32 +; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: ret +; +; RV32IF-LABEL: fold_demote: +; RV32IF: # %bb.0: +; RV32IF-NEXT: fmv.w.x ft0, a1 +; RV32IF-NEXT: fsgnj.s fa0, fa0, ft0 +; RV32IF-NEXT: ret +; +; RV32IFD-LABEL: fold_demote: +; RV32IFD: # %bb.0: +; RV32IFD-NEXT: fcvt.s.d ft0, fa1 +; RV32IFD-NEXT: fsgnj.s fa0, fa0, ft0 +; RV32IFD-NEXT: ret +; +; RV64IFD-LABEL: fold_demote: +; RV64IFD: # %bb.0: +; RV64IFD-NEXT: fcvt.s.d ft0, fa1 +; RV64IFD-NEXT: fsgnj.s fa0, fa0, ft0 +; RV64IFD-NEXT: ret + %c = fptrunc double %b to float + %t = call float @llvm.copysign.f32(float %a, float %c) + ret float %t +} diff --git a/llvm/test/CodeGen/RISCV/shifts.ll b/llvm/test/CodeGen/RISCV/shifts.ll index 54c49f3f3ef66..6e1575d9dc5e7 100644 --- a/llvm/test/CodeGen/RISCV/shifts.ll +++ b/llvm/test/CodeGen/RISCV/shifts.ll @@ -23,8 +23,7 @@ define i64 @lshr64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: slli a4, a1, 1 ; RV32I-NEXT: sll a3, a4, a3 ; RV32I-NEXT: or a0, a0, a3 -; RV32I-NEXT: srl a2, a1, a2 -; RV32I-NEXT: mv a1, a2 +; RV32I-NEXT: srl a1, a1, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: lshr64: @@ -114,8 +113,7 @@ define i64 @shl64(i64 %a, i64 %b) nounwind { ; RV32I-NEXT: srli a4, a0, 1 ; RV32I-NEXT: srl a3, a4, a3 ; RV32I-NEXT: or a1, a1, a3 -; RV32I-NEXT: sll a2, a0, a2 -; RV32I-NEXT: mv a0, a2 +; RV32I-NEXT: sll a0, a0, a2 ; RV32I-NEXT: ret ; ; RV64I-LABEL: shl64: @@ -191,8 +189,7 @@ define i128 @lshr128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: slli a4, a1, 1 ; RV64I-NEXT: sll a3, a4, a3 ; RV64I-NEXT: or a0, a0, a3 -; RV64I-NEXT: srl a2, a1, a2 -; 
RV64I-NEXT: mv a1, a2 +; RV64I-NEXT: srl a1, a1, a2 ; RV64I-NEXT: ret %1 = lshr i128 %a, %b ret i128 %1 @@ -298,8 +295,7 @@ define i128 @shl128(i128 %a, i128 %b) nounwind { ; RV64I-NEXT: srli a4, a0, 1 ; RV64I-NEXT: srl a3, a4, a3 ; RV64I-NEXT: or a1, a1, a3 -; RV64I-NEXT: sll a2, a0, a2 -; RV64I-NEXT: mv a0, a2 +; RV64I-NEXT: sll a0, a0, a2 ; RV64I-NEXT: ret %1 = shl i128 %a, %b ret i128 %1 diff --git a/llvm/test/CodeGen/RISCV/tls-models.ll b/llvm/test/CodeGen/RISCV/tls-models.ll index a2015b086f95f..25a2f71beb317 100644 --- a/llvm/test/CodeGen/RISCV/tls-models.ll +++ b/llvm/test/CodeGen/RISCV/tls-models.ll @@ -3,16 +3,17 @@ ; RUN: | FileCheck -check-prefix=RV32-PIC %s ; RUN: llc -mtriple=riscv64 -relocation-model=pic < %s \ ; RUN: | FileCheck -check-prefix=RV64-PIC %s -; RUN: llc -mtriple=riscv32 < %s | FileCheck -check-prefix=NOPIC %s -; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefix=NOPIC %s +; RUN: llc -mtriple=riscv32 < %s | FileCheck -check-prefix=RV32-NOPIC %s +; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefix=RV64-NOPIC %s ; Check that TLS symbols are lowered correctly based on the specified -; model. +; model. Make sure they're external to avoid them all being optimised to Local +; Exec for the executable. -@unspecified = thread_local global i32 42 -@ld = thread_local(localdynamic) global i32 42 -@ie = thread_local(initialexec) global i32 42 -@le = thread_local(localexec) global i32 42 +@unspecified = external thread_local global i32 +@ld = external thread_local(localdynamic) global i32 +@ie = external thread_local(initialexec) global i32 +@le = external thread_local(localexec) global i32 ; No model specified @@ -44,12 +45,23 @@ define i32* @f1() nounwind { ; RV64-PIC-NEXT: addi sp, sp, 16 ; RV64-PIC-NEXT: ret ; -; NOPIC-LABEL: f1: -; NOPIC: # %bb.0: # %entry -; NOPIC-NEXT: lui a0, %tprel_hi(unspecified) -; NOPIC-NEXT: add a0, a0, tp, %tprel_add(unspecified) -; NOPIC-NEXT: addi a0, a0, %tprel_lo(unspecified) -; NOPIC-NEXT: ret +; RV32-NOPIC-LABEL: f1: +; RV32-NOPIC: # %bb.0: # %entry +; RV32-NOPIC-NEXT: .LBB0_1: # %entry +; RV32-NOPIC-NEXT: # Label of block must be emitted +; RV32-NOPIC-NEXT: auipc a0, %tls_ie_pcrel_hi(unspecified) +; RV32-NOPIC-NEXT: lw a0, %pcrel_lo(.LBB0_1)(a0) +; RV32-NOPIC-NEXT: add a0, a0, tp +; RV32-NOPIC-NEXT: ret +; +; RV64-NOPIC-LABEL: f1: +; RV64-NOPIC: # %bb.0: # %entry +; RV64-NOPIC-NEXT: .LBB0_1: # %entry +; RV64-NOPIC-NEXT: # Label of block must be emitted +; RV64-NOPIC-NEXT: auipc a0, %tls_ie_pcrel_hi(unspecified) +; RV64-NOPIC-NEXT: ld a0, %pcrel_lo(.LBB0_1)(a0) +; RV64-NOPIC-NEXT: add a0, a0, tp +; RV64-NOPIC-NEXT: ret entry: ret i32* @unspecified } @@ -84,12 +96,23 @@ define i32* @f2() nounwind { ; RV64-PIC-NEXT: addi sp, sp, 16 ; RV64-PIC-NEXT: ret ; -; NOPIC-LABEL: f2: -; NOPIC: # %bb.0: # %entry -; NOPIC-NEXT: lui a0, %tprel_hi(ld) -; NOPIC-NEXT: add a0, a0, tp, %tprel_add(ld) -; NOPIC-NEXT: addi a0, a0, %tprel_lo(ld) -; NOPIC-NEXT: ret +; RV32-NOPIC-LABEL: f2: +; RV32-NOPIC: # %bb.0: # %entry +; RV32-NOPIC-NEXT: .LBB1_1: # %entry +; RV32-NOPIC-NEXT: # Label of block must be emitted +; RV32-NOPIC-NEXT: auipc a0, %tls_ie_pcrel_hi(ld) +; RV32-NOPIC-NEXT: lw a0, %pcrel_lo(.LBB1_1)(a0) +; RV32-NOPIC-NEXT: add a0, a0, tp +; RV32-NOPIC-NEXT: ret +; +; RV64-NOPIC-LABEL: f2: +; RV64-NOPIC: # %bb.0: # %entry +; RV64-NOPIC-NEXT: .LBB1_1: # %entry +; RV64-NOPIC-NEXT: # Label of block must be emitted +; RV64-NOPIC-NEXT: auipc a0, %tls_ie_pcrel_hi(ld) +; RV64-NOPIC-NEXT: ld a0, %pcrel_lo(.LBB1_1)(a0) +; RV64-NOPIC-NEXT: add a0, a0, tp 
+; RV64-NOPIC-NEXT: ret entry: ret i32* @ld } @@ -116,12 +139,23 @@ define i32* @f3() nounwind { ; RV64-PIC-NEXT: add a0, a0, tp ; RV64-PIC-NEXT: ret ; -; NOPIC-LABEL: f3: -; NOPIC: # %bb.0: # %entry -; NOPIC-NEXT: lui a0, %tprel_hi(ie) -; NOPIC-NEXT: add a0, a0, tp, %tprel_add(ie) -; NOPIC-NEXT: addi a0, a0, %tprel_lo(ie) -; NOPIC-NEXT: ret +; RV32-NOPIC-LABEL: f3: +; RV32-NOPIC: # %bb.0: # %entry +; RV32-NOPIC-NEXT: .LBB2_1: # %entry +; RV32-NOPIC-NEXT: # Label of block must be emitted +; RV32-NOPIC-NEXT: auipc a0, %tls_ie_pcrel_hi(ie) +; RV32-NOPIC-NEXT: lw a0, %pcrel_lo(.LBB2_1)(a0) +; RV32-NOPIC-NEXT: add a0, a0, tp +; RV32-NOPIC-NEXT: ret +; +; RV64-NOPIC-LABEL: f3: +; RV64-NOPIC: # %bb.0: # %entry +; RV64-NOPIC-NEXT: .LBB2_1: # %entry +; RV64-NOPIC-NEXT: # Label of block must be emitted +; RV64-NOPIC-NEXT: auipc a0, %tls_ie_pcrel_hi(ie) +; RV64-NOPIC-NEXT: ld a0, %pcrel_lo(.LBB2_1)(a0) +; RV64-NOPIC-NEXT: add a0, a0, tp +; RV64-NOPIC-NEXT: ret entry: ret i32* @ie } @@ -144,12 +178,19 @@ define i32* @f4() nounwind { ; RV64-PIC-NEXT: addi a0, a0, %tprel_lo(le) ; RV64-PIC-NEXT: ret ; -; NOPIC-LABEL: f4: -; NOPIC: # %bb.0: # %entry -; NOPIC-NEXT: lui a0, %tprel_hi(le) -; NOPIC-NEXT: add a0, a0, tp, %tprel_add(le) -; NOPIC-NEXT: addi a0, a0, %tprel_lo(le) -; NOPIC-NEXT: ret +; RV32-NOPIC-LABEL: f4: +; RV32-NOPIC: # %bb.0: # %entry +; RV32-NOPIC-NEXT: lui a0, %tprel_hi(le) +; RV32-NOPIC-NEXT: add a0, a0, tp, %tprel_add(le) +; RV32-NOPIC-NEXT: addi a0, a0, %tprel_lo(le) +; RV32-NOPIC-NEXT: ret +; +; RV64-NOPIC-LABEL: f4: +; RV64-NOPIC: # %bb.0: # %entry +; RV64-NOPIC-NEXT: lui a0, %tprel_hi(le) +; RV64-NOPIC-NEXT: add a0, a0, tp, %tprel_add(le) +; RV64-NOPIC-NEXT: addi a0, a0, %tprel_lo(le) +; RV64-NOPIC-NEXT: ret entry: ret i32* @le } diff --git a/llvm/test/CodeGen/SPARC/fp128.ll b/llvm/test/CodeGen/SPARC/fp128.ll index 83912e0f211ee..1f5d2db661d93 100644 --- a/llvm/test/CodeGen/SPARC/fp128.ll +++ b/llvm/test/CodeGen/SPARC/fp128.ll @@ -1,8 +1,17 @@ ; RUN: llc < %s -march=sparc -mattr=hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=HARD --check-prefix=BE ; RUN: llc < %s -march=sparcel -mattr=hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=HARD --check-prefix=EL -; RUN: llc < %s -march=sparc -mattr=-hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT --check-prefix=BE +; RUN: llc < %s -march=sparc -mattr=-hard-quad-float -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT --check-prefix=BE ; RUN: llc < %s -march=sparcel -mattr=-hard-quad-float | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT --check-prefix=EL +; XFAIL: * +; This test currently fails with expensive checks enabled; for more details, see +; https://bugs.llvm.org/show_bug.cgi?id=44091. +; *** Bad machine code: Expected a register operand. *** +; - function: f128_compare +; - basic block: %bb.0 entry (0x63f4028) +; - instruction: CMPrr killed %21:intregs, 0, implicit-def $icc +; - operand 1: 0 +; NB: When this is fixed, the verifier should not be run by default in the CL above. ; CHECK-LABEL: f128_ops: ; CHECK: ldd diff --git a/llvm/test/CodeGen/SystemZ/fp-libcall.ll b/llvm/test/CodeGen/SystemZ/fp-libcall.ll index 75250b811cba5..2df25aaf814c4 100644 --- a/llvm/test/CodeGen/SystemZ/fp-libcall.ll +++ b/llvm/test/CodeGen/SystemZ/fp-libcall.ll @@ -233,6 +233,68 @@ define fp128 @f33(fp128 %x, fp128 %y) { ret fp128 %tmp } +; Verify that "nnan" minnum/maxnum calls are transformed to +; compare+select sequences instead of libcalls.
+define float @f34(float %x, float %y) { +; CHECK-LABEL: f34: +; CHECK: cebr %f0, %f2 +; CHECK: blr %r14 +; CHECK: ler %f0, %f2 +; CHECK: br %r14 + %tmp = call nnan float @llvm.minnum.f32(float %x, float %y) + ret float %tmp +} + +define double @f35(double %x, double %y) { +; CHECK-LABEL: f35: +; CHECK: cdbr %f0, %f2 +; CHECK: blr %r14 +; CHECK: ldr %f0, %f2 +; CHECK: br %r14 + %tmp = call nnan double @llvm.minnum.f64(double %x, double %y) + ret double %tmp +} + +define fp128 @f36(fp128 %x, fp128 %y) { +; CHECK-LABEL: f36: +; CHECK: cxbr +; CHECK: jl +; CHECK: lxr +; CHECK: br %r14 + %tmp = call nnan fp128 @llvm.minnum.f128(fp128 %x, fp128 %y) + ret fp128 %tmp +} + +define float @f37(float %x, float %y) { +; CHECK-LABEL: f37: +; CHECK: cebr %f0, %f2 +; CHECK: bhr %r14 +; CHECK: ler %f0, %f2 +; CHECK: br %r14 + %tmp = call nnan float @llvm.maxnum.f32(float %x, float %y) + ret float %tmp +} + +define double @f38(double %x, double %y) { +; CHECK-LABEL: f38: +; CHECK: cdbr %f0, %f2 +; CHECK: bhr %r14 +; CHECK: ldr %f0, %f2 +; CHECK: br %r14 + %tmp = call nnan double @llvm.maxnum.f64(double %x, double %y) + ret double %tmp +} + +define fp128 @f39(fp128 %x, fp128 %y) { +; CHECK-LABEL: f39: +; CHECK: cxbr +; CHECK: jh +; CHECK: lxr +; CHECK: br %r14 + %tmp = call nnan fp128 @llvm.maxnum.f128(fp128 %x, fp128 %y) + ret fp128 %tmp +} + declare float @llvm.powi.f32(float, i32) declare double @llvm.powi.f64(double, i32) declare fp128 @llvm.powi.f128(fp128, i32) diff --git a/llvm/test/CodeGen/Thumb/callee_save_reserved.ll b/llvm/test/CodeGen/Thumb/callee_save_reserved.ll deleted file mode 100644 index 0329d7886a2a9..0000000000000 --- a/llvm/test/CodeGen/Thumb/callee_save_reserved.ll +++ /dev/null @@ -1,15 +0,0 @@ -; RUN: llc < %s -mtriple=thumbv6m-none-eabi -verify-machineinstrs -frame-pointer=none -mattr=+reserve-r6,+reserve-r8 \ -; RUN: -asm-verbose=false | FileCheck --check-prefix=CHECK-INVALID %s - -; Reserved low registers should not be used to correct reg deficit. 
-define <4 x i32> @four_high_four_return_reserved() { -entry: - ; CHECK-INVALID-NOT: r{{6|8}} - tail call void asm sideeffect "", "~{r8},~{r9}"() - %vecinit = insertelement <4 x i32> undef, i32 1, i32 0 - %vecinit11 = insertelement <4 x i32> %vecinit, i32 2, i32 1 - %vecinit12 = insertelement <4 x i32> %vecinit11, i32 3, i32 2 - %vecinit13 = insertelement <4 x i32> %vecinit12, i32 4, i32 3 - ret <4 x i32> %vecinit13 -} - diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll index 79c81ca7a449c..257d950c60fb3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -4,7 +4,7 @@ ; CHECK: vector.body: ; CHECK: %index = phi i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef) @@ -57,7 +57,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; CHECK: vector.body: ; CHECK: %index = phi i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) @@ -109,7 +109,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; CHECK-LABEL: mul_v4i32 ; CHECK: vector.body: ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) @@ -158,59 +158,11 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } -; CHECK-LABEL: copy_v2i64 -; CHECK: vector.body: -; CHECK: %index = phi i32 -; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]]) -; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2 -; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef) -; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]]) -define void @copy_v2i64(i64* %a, i64* %b, i32 %N) { -entry: - %cmp8 = icmp eq i32 %N, 0 - %tmp8 = add i32 %N, 1 - %tmp9 = lshr i32 %tmp8, 1 - %tmp10 = shl nuw 
i32 %tmp9, 1 - %tmp11 = add i32 %tmp10, -2 - %tmp12 = lshr i32 %tmp11, 1 - %tmp13 = add nuw nsw i32 %tmp12, 1 - br i1 %cmp8, label %for.cond.cleanup, label %vector.ph - -vector.ph: ; preds = %entry - %trip.count.minus.1 = add i32 %N, -1 - %broadcast.splatinsert10 = insertelement <2 x i32> undef, i32 %trip.count.minus.1, i32 0 - %broadcast.splat11 = shufflevector <2 x i32> %broadcast.splatinsert10, <2 x i32> undef, <2 x i32> zeroinitializer - call void @llvm.set.loop.iterations.i32(i32 %tmp13) - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] - %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0 - %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer - %induction = add <2 x i32> %broadcast.splat, - %tmp1 = icmp ule <2 x i32> %induction, %broadcast.splat11 - %tmp = getelementptr inbounds i64, i64* %a, i32 %index - %tmp2 = bitcast i64* %tmp to <2 x i64>* - %wide.masked.load = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %tmp2, i32 4, <2 x i1> %tmp1, <2 x i64> undef) - %tmp3 = getelementptr inbounds i64, i64* %b, i32 %index - %tmp7 = bitcast i64* %tmp3 to <2 x i64>* - tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %wide.masked.load, <2 x i64>* %tmp7, i32 4, <2 x i1> %tmp1) - %index.next = add i32 %index, 2 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) - %tmp16 = icmp ne i32 %tmp15, 0 - br i1 %tmp16, label %vector.body, label %for.cond.cleanup - -for.cond.cleanup: ; preds = %vector.body, %entry - ret void -} - ; CHECK-LABEL: split_vector ; CHECK: vector.body: ; CHECK: %index = phi i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) @@ -268,7 +220,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; One of the loads now uses ult predicate. 
; CHECK-LABEL: mismatch_load_pred ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef) @@ -322,7 +274,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; CHECK-LABEL: mismatch_store_pred ; CHECK: %index = phi i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]]) ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index 5900dd9ac66a9..0b50b9a1db4e9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -154,11 +154,11 @@ for.cond.cleanup: ; preds = %middle.block, %entr ; CHECK-NEXT: vldrwt.u32 ; CHECK-NEXT: vldrwt.u32 ; CHECK: mov [[ELEMS_OUT:r[0-9]+]], [[ELEMS]] +; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4 ; CHECK: vpsttt ; CHECK-NEXT: vcmpt.i32 eq, {{.*}}, zr ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r3] ; CHECK-NEXT: vldrwt.u32 q{{.*}}, [r2] -; CHECK: sub{{.*}} [[ELEMS]],{{.*}}#4 ; CHECK: le lr, [[LOOP]] ; CHECK: vctp.32 [[ELEMS_OUT]] ; CHECK: vpsel diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index 02d05ef9c0f61..f7c9236c6e62f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -36,21 +36,14 @@ define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocaptur ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_4: @ %vector.ph -; CHECK-NEXT: adds r6, r3, #3 -; CHECK-NEXT: bic r6, r6, #3 -; CHECK-NEXT: subs r6, #4 -; CHECK-NEXT: add.w lr, r12, r6, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r2] -; CHECK-NEXT: vmul.f32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: vldrw.u32 q1, [r2], #16 ; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vmul.f32 q0, q1, q0 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_5 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new @@ -240,13 +233,11 @@ define arm_aapcs_vfpcc float @fast_float_mac(float* nocapture readonly %b, float ; CHECK-NEXT: .LBB1_2: @ %vector.body ; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q2, [r0] -; CHECK-NEXT: vldrwt.u32 q3, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 +; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vfma.f32 q0, q3, q2 ; CHECK-NEXT: le lr, .LBB1_2 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll index 38e688bbf6288..23c447284293f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -16,17 +16,19 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture re ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r12, r2 ; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vldrb.u32 q2, [r2] +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q2, [r2] ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: sub.w r2, r12, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -82,19 +84,13 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q2, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrh.s32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB1_1 @@ -160,17 +156,19 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture r ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 ; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r12, r2 ; CHECK-NEXT: adds r2, r1, r3 -; CHECK-NEXT: vldrb.u32 q2, [r2] +; CHECK-NEXT: vctp.32 r12 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q2, [r2] ; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: sub.w r2, r12, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: letp lr, .LBB2_1 +; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r12 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -226,19 +224,13 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocaptur ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: 
dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q2, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r1, #8 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrh.u32 q2, [r1], #8 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB3_1 @@ -297,19 +289,13 @@ define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 ; CHECK-NEXT: letp lr, .LBB4_1 @@ -392,13 +378,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB5_9 ; CHECK-NEXT: .LBB5_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: bic r7, r7, #3 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r5, r0, r4 @@ -406,11 +387,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly ; CHECK-NEXT: adds r5, r1, r4 ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB5_5 ; CHECK-NEXT: b .LBB5_12 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new @@ -607,23 +587,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readon ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w lr, r12, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic lr, lr, #3 -; CHECK-NEXT: sub.w lr, lr, #4 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.s32 q0, [r0] -; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrh.s32 q0, [r0], #8 +; CHECK-NEXT: vldrh.s32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -703,13 +675,8 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: b .LBB7_9 ; CHECK-NEXT: .LBB7_4: @ %vector.ph -; CHECK-NEXT: add.w r7, r12, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: bic r7, r7, #3 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: subs r7, #4 
-; CHECK-NEXT: add.w lr, r6, r7, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB7_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: adds r5, r0, r4 @@ -717,11 +684,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl ; CHECK-NEXT: adds r5, r1, r4 ; CHECK-NEXT: vldrb.u32 q1, [r5] ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB7_5 ; CHECK-NEXT: b .LBB7_12 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new @@ -918,23 +884,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture reado ; CHECK-NEXT: cmp.w r12, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w lr, r12, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: bic lr, lr, #3 -; CHECK-NEXT: sub.w lr, lr, #4 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u32 q0, [r0] -; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrh.u32 q0, [r0], #8 +; CHECK-NEXT: vldrh.u32 q1, [r1], #8 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} @@ -1016,22 +974,15 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly ; CHECK-NEXT: mov.w r12, #0 ; CHECK-NEXT: b .LBB9_8 ; CHECK-NEXT: .LBB9_4: @ %vector.ph -; CHECK-NEXT: add.w r4, r12, #3 -; CHECK-NEXT: bic r4, r4, #3 -; CHECK-NEXT: subs r4, #4 -; CHECK-NEXT: add.w lr, lr, r4, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 ; CHECK-NEXT: vmul.i32 q0, q1, q0 -; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r3] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r3], #16 ; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new @@ -1217,24 +1168,18 @@ define dso_local arm_aapcs_vfpcc void @test_v8i8_to_v8i16(i16* noalias nocapture ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w r12, r3, #7 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #7 -; CHECK-NEXT: sub.w r12, r12, #8 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.16 lr, lr +; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r4, r1, r12 ; CHECK-NEXT: vldrb.u16 q0, [r4] ; CHECK-NEXT: add.w r4, r2, r12 -; CHECK-NEXT: vldrb.u16 q1, [r4] -; CHECK-NEXT: vmul.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: adds r0, #16 ; 
CHECK-NEXT: add.w r12, r12, #8 ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vldrb.u16 q1, [r4] +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll index 2f9d301e8086d..f67a59f74fb80 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll @@ -28,7 +28,7 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef) @@ -140,7 +140,7 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]]) ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/out-of-range-cbz.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/out-of-range-cbz.mir new file mode 100644 index 0000000000000..c5a38ea13454f --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/out-of-range-cbz.mir @@ -0,0 +1,451 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-cp-islands %s -o - | FileCheck %s +--- | + @d = hidden local_unnamed_addr global i32 0, align 4 + @a = hidden global i32 0, align 4 + @e = hidden local_unnamed_addr global i32 0, align 4 + + define hidden void @f(i64 %g) { + entry: + %conv = trunc i64 %g to i32 + %tobool5 = icmp eq i64 %g, 0 + br i1 %tobool5, label %j.us.us.preheader, label %entry.split + + j.us.us.preheader: ; preds = %entry + %.pre59 = load i32, i32* @d, align 4 + br label %j.us.us + + j.us.us: ; preds = %j.us.us, %if.end.us.us.us, %if.end.us.us.us.1, %if.end.us.us.us.2, %if.end.us.us.us.3, %if.end.us.us.us.4, %if.end.us.us.us.5, %if.end.us.us.us.6, %j.us.us.preheader + %0 = phi i32 [ %.pre59, %j.us.us.preheader ], [ %12, %if.end.us.us.us.6 ], [ %11, %if.end.us.us.us.5 ], [ %10, %if.end.us.us.us.4 ], [ %9, %if.end.us.us.us.3 ], [ %8, %if.end.us.us.us.2 ], [ %7, %if.end.us.us.us.1 ], [ %2, %if.end.us.us.us ], [ %0, %j.us.us ] + %cmp.us.us = icmp slt 
i32 %0, ptrtoint (i32* @a to i32) + %conv1.us.us = zext i1 %cmp.us.us to i32 + %1 = load i32, i32* @e, align 4 + %and.us.us = and i32 %1, %conv1.us.us + store i32 %and.us.us, i32* @e, align 4 + %tobool4.us.us.us = icmp eq i32 %0, 0 + br i1 %tobool4.us.us.us, label %if.end.us.us.us, label %j.us.us + + if.end.us.us.us: ; preds = %j.us.us + tail call void asm sideeffect "", ""() + %2 = load i32, i32* @d, align 4 + %tobool4.us.us.us.1 = icmp eq i32 %2, 0 + br i1 %tobool4.us.us.us.1, label %if.end.us.us.us.1, label %j.us.us + + entry.split: ; preds = %entry + %tobool = icmp eq i32 %conv, 0 + br i1 %tobool, label %j.us27.preheader, label %j.preheader + + j.preheader: ; preds = %entry.split + %.pre = load i32, i32* @e, align 4 + %.pre55 = load i32, i32* @d, align 4 + %cmp = icmp slt i32 %conv, ptrtoint (i32* @a to i32) + %conv1 = zext i1 %cmp to i32 + br label %j + + j.us27.preheader: ; preds = %entry.split + %.pre56 = load i32, i32* @d, align 4 + %.pre57 = load i32, i32* @e, align 4 + %cmp.us29 = icmp slt i32 %.pre56, ptrtoint (i32* @a to i32) + %conv1.us30 = zext i1 %cmp.us29 to i32 + br label %j.us27 + + j.us27: ; preds = %j.us27, %j.us27.preheader + %3 = phi i32 [ %.pre57, %j.us27.preheader ], [ %and.us31, %j.us27 ] + %4 = icmp eq i32 %.pre56, 0 + %and.us31 = and i32 %3, %conv1.us30 + br i1 %4, label %if.end.us38, label %j.us27 + + if.end.us38: ; preds = %j.us27 + store i32 %and.us31, i32* @e, align 4 + tail call void asm sideeffect "", ""() + ret void + + j: ; preds = %j, %j.preheader + %5 = phi i32 [ %.pre, %j.preheader ], [ %and, %j ] + %6 = icmp eq i32 %.pre55, 0 + %and = and i32 %5, %conv1 + br i1 %6, label %if.end, label %j + + if.end: ; preds = %j + store i32 %and, i32* @e, align 4 + tail call void asm sideeffect "", ""() + ret void + + if.end.us.us.us.1: ; preds = %if.end.us.us.us + tail call void asm sideeffect "", ""() + %7 = load i32, i32* @d, align 4 + %tobool4.us.us.us.2 = icmp eq i32 %7, 0 + br i1 %tobool4.us.us.us.2, label %if.end.us.us.us.2, label %j.us.us + + if.end.us.us.us.2: ; preds = %if.end.us.us.us.1 + tail call void asm sideeffect "", ""() + %8 = load i32, i32* @d, align 4 + %tobool4.us.us.us.3 = icmp eq i32 %8, 0 + br i1 %tobool4.us.us.us.3, label %if.end.us.us.us.3, label %j.us.us + + if.end.us.us.us.3: ; preds = %if.end.us.us.us.2 + tail call void asm sideeffect "", ""() + %9 = load i32, i32* @d, align 4 + %tobool4.us.us.us.4 = icmp eq i32 %9, 0 + br i1 %tobool4.us.us.us.4, label %if.end.us.us.us.4, label %j.us.us + + if.end.us.us.us.4: ; preds = %if.end.us.us.us.3 + tail call void asm sideeffect "", ""() + %10 = load i32, i32* @d, align 4 + %tobool4.us.us.us.5 = icmp eq i32 %10, 0 + br i1 %tobool4.us.us.us.5, label %if.end.us.us.us.5, label %j.us.us + + if.end.us.us.us.5: ; preds = %if.end.us.us.us.4 + tail call void asm sideeffect "", ""() + %11 = load i32, i32* @d, align 4 + %tobool4.us.us.us.6 = icmp eq i32 %11, 0 + br i1 %tobool4.us.us.us.6, label %if.end.us.us.us.6, label %j.us.us + + if.end.us.us.us.6: ; preds = %if.end.us.us.us.5 + tail call void asm sideeffect "", ""() + %12 = load i32, i32* @d, align 4 + %tobool4.us.us.us.7 = icmp eq i32 %12, 0 + br i1 %tobool4.us.us.us.7, label %if.end.us.us.us.7, label %j.us.us + + if.end.us.us.us.7: ; preds = %if.end.us.us.us.6 + tail call void asm sideeffect "", ""() + ret void + } + +... 
+--- +name: f +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: f + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.5(0x30000000), %bb.1(0x50000000) + ; CHECK: liveins: $r0, $r1, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: dead renamable $r1, $cpsr = tORR killed renamable $r1, renamable $r0, 14, $noreg + ; CHECK: tBcc %bb.5, 0, killed $cpsr + ; CHECK: bb.1.entry.split: + ; CHECK: successors: %bb.15(0x30000000), %bb.2(0x50000000) + ; CHECK: liveins: $r0 + ; CHECK: tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.15, 0, killed $cpsr + ; CHECK: bb.2.j.preheader: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: liveins: $r0 + ; CHECK: $r1 = t2MOVi16 target-flags(arm-lo16) @a, 14, $noreg + ; CHECK: $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @a, 14, $noreg + ; CHECK: tCMPr killed renamable $r0, killed renamable $r1, 14, $noreg, implicit-def $cpsr + ; CHECK: $r1 = t2MOVi16 target-flags(arm-lo16) @d, 14, $noreg + ; CHECK: renamable $r0 = t2CSINC $zr, $zr, 10, implicit killed $cpsr + ; CHECK: $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @d, 14, $noreg + ; CHECK: renamable $r2 = tLDRi killed renamable $r1, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: $r1 = t2MOVi16 target-flags(arm-lo16) @e, 14, $noreg + ; CHECK: $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @e, 14, $noreg + ; CHECK: renamable $r3 = tLDRi renamable $r1, 0, 14, $noreg :: (dereferenceable load 4 from @e) + ; CHECK: bb.3.j (align 4): + ; CHECK: successors: %bb.4(0x04000000), %bb.3(0x7c000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r3, dead $cpsr = tAND killed renamable $r3, renamable $r0, 14, $noreg + ; CHECK: tCBZ $r2, %bb.4 + ; CHECK: bb.4.if.end: + ; CHECK: liveins: $r1, $r3 + ; CHECK: tSTRi killed renamable $r3, killed renamable $r1, 0, 14, $noreg :: (store 4 into @e) + ; CHECK: INLINEASM &"", 1 + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + ; CHECK: bb.5.j.us.us.preheader: + ; CHECK: successors: %bb.6(0x80000000) + ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @d, 14, $noreg + ; CHECK: $lr = t2MOVi16 
target-flags(arm-lo16) @a, 14, $noreg + ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @d, 14, $noreg + ; CHECK: $r2 = t2MOVi16 target-flags(arm-lo16) @e, 14, $noreg + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: $lr = t2MOVTi16 killed $lr, target-flags(arm-hi16) @a, 14, $noreg + ; CHECK: $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @e, 14, $noreg + ; CHECK: bb.6.j.us.us (align 4): + ; CHECK: successors: %bb.7(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r3, $r12 + ; CHECK: tCMPhir renamable $r3, renamable $lr, 14, $noreg, implicit-def $cpsr + ; CHECK: renamable $r1 = tLDRi renamable $r2, 0, 14, $noreg :: (dereferenceable load 4 from @e) + ; CHECK: renamable $r0 = t2CSINC $zr, $zr, 10, implicit killed $cpsr + ; CHECK: renamable $r0 = t2ANDrr killed renamable $r0, killed renamable $r1, 14, $noreg, $noreg + ; CHECK: tSTRi killed renamable $r0, renamable $r2, 0, 14, $noreg :: (store 4 into @e) + ; CHECK: tCBZ $r3, %bb.7 + ; CHECK: bb.7.if.end.us.us.us: + ; CHECK: successors: %bb.8(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.8 + ; CHECK: bb.8.if.end.us.us.us.1: + ; CHECK: successors: %bb.9(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.9 + ; CHECK: bb.9.if.end.us.us.us.2: + ; CHECK: successors: %bb.10(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.10 + ; CHECK: bb.10.if.end.us.us.us.3: + ; CHECK: successors: %bb.11(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.11 + ; CHECK: bb.11.if.end.us.us.us.4: + ; CHECK: successors: %bb.12(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.12 + ; CHECK: bb.12.if.end.us.us.us.5: + ; CHECK: successors: %bb.13(0x40000000), %bb.6(0x40000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.13 + ; CHECK: bb.13.if.end.us.us.us.6: + ; CHECK: successors: %bb.14(0x04000000), %bb.6(0x7c000000) + ; CHECK: liveins: $lr, $r2, $r12 + ; CHECK: INLINEASM &"", 1 + ; CHECK: renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCBZ $r3, %bb.14 + ; CHECK: bb.14.if.end.us.us.us.7: + ; CHECK: INLINEASM &"", 1 + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + ; CHECK: bb.15.j.us27.preheader: + ; CHECK: successors: %bb.16(0x80000000) + ; CHECK: $r0 = t2MOVi16 target-flags(arm-lo16) @d, 14, $noreg + ; CHECK: $r1 = t2MOVi16 target-flags(arm-lo16) @a, 14, $noreg + ; CHECK: $r0 = t2MOVTi16 killed $r0, target-flags(arm-hi16) @d, 14, $noreg + ; CHECK: $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @a, 14, $noreg + ; CHECK: renamable $r0 = 
tLDRi killed renamable $r0, 0, 14, $noreg :: (dereferenceable load 4 from @d) + ; CHECK: tCMPr renamable $r0, killed renamable $r1, 14, $noreg, implicit-def $cpsr + ; CHECK: $r1 = t2MOVi16 target-flags(arm-lo16) @e, 14, $noreg + ; CHECK: $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @e, 14, $noreg + ; CHECK: renamable $r2 = t2CSINC $zr, $zr, 10, implicit killed $cpsr + ; CHECK: renamable $r3 = tLDRi renamable $r1, 0, 14, $noreg :: (dereferenceable load 4 from @e) + ; CHECK: bb.16.j.us27 (align 4): + ; CHECK: successors: %bb.17(0x04000000), %bb.16(0x7c000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: renamable $r3, dead $cpsr = tAND killed renamable $r3, renamable $r2, 14, $noreg + ; CHECK: tCBZ $r0, %bb.17 + ; CHECK: bb.17.if.end.us38: + ; CHECK: liveins: $r1, $r3 + ; CHECK: tSTRi killed renamable $r3, killed renamable $r1, 0, 14, $noreg :: (store 4 into @e) + ; CHECK: INLINEASM &"", 1 + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.1(0x30000000), %bb.11(0x50000000) + liveins: $r0, $r1, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + dead renamable $r1, $cpsr = tORR killed renamable $r1, renamable $r0, 14, $noreg + t2Bcc %bb.1, 0, killed $cpsr + + bb.11.entry.split: + successors: %bb.15(0x30000000), %bb.12(0x50000000) + liveins: $r0 + + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.15, 0, killed $cpsr + + bb.12.j.preheader: + successors: %bb.13(0x80000000) + liveins: $r0 + + $r1 = t2MOVi16 target-flags(arm-lo16) @a, 14, $noreg + $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @a, 14, $noreg + tCMPr killed renamable $r0, killed renamable $r1, 14, $noreg, implicit-def $cpsr + $r1 = t2MOVi16 target-flags(arm-lo16) @d, 14, $noreg + renamable $r0 = t2CSINC $zr, $zr, 10, implicit killed $cpsr + $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @d, 14, $noreg + renamable $r2 = tLDRi killed renamable $r1, 0, 14, $noreg :: (dereferenceable load 4 from @d) + $r1 = t2MOVi16 target-flags(arm-lo16) @e, 14, $noreg + $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @e, 14, $noreg + renamable $r3 = tLDRi renamable $r1, 0, 14, $noreg :: (dereferenceable load 4 from @e) + + bb.13.j (align 4): + successors: %bb.14(0x04000000), %bb.13(0x7c000000) + liveins: $r0, $r1, $r2, $r3 + + renamable $r3, dead $cpsr = tAND killed renamable $r3, renamable $r0, 14, $noreg + tCMPi8 renamable $r2, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.13, 1, killed $cpsr + + bb.14.if.end: + liveins: $r1, $r3 + + tSTRi killed renamable $r3, killed renamable $r1, 0, 14, $noreg :: (store 4 into @e) + INLINEASM &"", 1 + tPOP_RET 14, $noreg, def $r7, def $pc + + bb.1.j.us.us.preheader: + successors: %bb.2(0x80000000) + + $r12 = t2MOVi16 target-flags(arm-lo16) @d, 14, $noreg + $lr = t2MOVi16 target-flags(arm-lo16) @a, 14, $noreg + $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @d, 14, $noreg + $r2 = t2MOVi16 target-flags(arm-lo16) @e, 14, $noreg + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + $lr = t2MOVTi16 killed $lr, target-flags(arm-hi16) @a, 14, $noreg + $r2 = t2MOVTi16 killed $r2, target-flags(arm-hi16) @e, 14, $noreg + + bb.2.j.us.us (align 4): + successors: %bb.3(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r3, $r12 + + tCMPhir renamable $r3, renamable $lr, 14, $noreg, implicit-def $cpsr + renamable $r1 = tLDRi 
renamable $r2, 0, 14, $noreg :: (dereferenceable load 4 from @e) + renamable $r0 = t2CSINC $zr, $zr, 10, implicit killed $cpsr + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + renamable $r0 = t2ANDrr killed renamable $r0, killed renamable $r1, 14, $noreg, $noreg + tSTRi killed renamable $r0, renamable $r2, 0, 14, $noreg :: (store 4 into @e) + t2Bcc %bb.2, 1, killed $cpsr + + bb.3.if.end.us.us.us: + successors: %bb.4(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.4.if.end.us.us.us.1: + successors: %bb.5(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.5.if.end.us.us.us.2: + successors: %bb.6(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.6.if.end.us.us.us.3: + successors: %bb.7(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.7.if.end.us.us.us.4: + successors: %bb.8(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.8.if.end.us.us.us.5: + successors: %bb.9(0x40000000), %bb.2(0x40000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.9.if.end.us.us.us.6: + successors: %bb.10(0x04000000), %bb.2(0x7c000000) + liveins: $lr, $r2, $r12 + + INLINEASM &"", 1 + renamable $r3 = t2LDRi12 renamable $r12, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPi8 renamable $r3, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.2, 1, killed $cpsr + + bb.10.if.end.us.us.us.7: + INLINEASM &"", 1 + tPOP_RET 14, $noreg, def $r7, def $pc + + bb.15.j.us27.preheader: + successors: %bb.16(0x80000000) + + $r0 = t2MOVi16 target-flags(arm-lo16) @d, 14, $noreg + $r1 = t2MOVi16 target-flags(arm-lo16) @a, 14, $noreg + $r0 = t2MOVTi16 killed $r0, target-flags(arm-hi16) @d, 14, $noreg + $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @a, 14, $noreg + renamable $r0 = tLDRi killed renamable $r0, 0, 14, $noreg :: (dereferenceable load 4 from @d) + tCMPr renamable $r0, killed renamable $r1, 14, $noreg, implicit-def $cpsr + $r1 = t2MOVi16 target-flags(arm-lo16) @e, 14, $noreg + $r1 = t2MOVTi16 killed $r1, target-flags(arm-hi16) @e, 14, $noreg + renamable $r2 = t2CSINC $zr, $zr, 10, implicit killed $cpsr + renamable $r3 = tLDRi renamable $r1, 0, 14, $noreg :: (dereferenceable load 4 from @e) + + bb.16.j.us27 (align 4): + successors: %bb.17(0x04000000), %bb.16(0x7c000000) + liveins: $r0, $r1, $r2, $r3 + + renamable $r3, dead $cpsr = tAND killed renamable $r3, 
renamable $r2, 14, $noreg + tCMPi8 renamable $r0, 0, 14, $noreg, implicit-def $cpsr + t2Bcc %bb.16, 1, killed $cpsr + + bb.17.if.end.us38: + liveins: $r1, $r3 + + tSTRi killed renamable $r3, killed renamable $r1, 0, 14, $noreg :: (store 4 into @e) + INLINEASM &"", 1 + tPOP_RET 14, $noreg, def $r7, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll index 70e272ffc0dce..330c6db24a74c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll @@ -1,7 +1,7 @@ ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s ; CHECK-LABEL: expand_v8i16_v8i32 -; CHECK-NOT: call i32 @llvm.arm.vctp +; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 @@ -50,7 +50,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry ; CHECK-LABEL: expand_v8i16_v4i32 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]]) ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store @@ -117,7 +117,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry } ; CHECK-LABEL: expand_v4i32_v4i64 -; CHECK-NOT: call i32 @llvm.arm.vctp +; CHECK-NOT: call i32 @llvm.arm.mve.vctp define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll index 7cdd28fd0f3cf..c7ed9ce674dd4 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -5,7 +5,7 @@ ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ] ; CHECK: phi i32 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]]) +; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) @@ -63,7 +63,7 @@ middle.block: ; preds = %vector.body ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ] ; CHECK: phi i32 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]]) +; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]]) ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef) define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir new file mode 100644 index 0000000000000..2ccb8da48d841 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-def.mir @@ -0,0 +1,153 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops -verify-machineinstrs %s -o - | FileCheck %s +# Check that subs isn't used during the revert because there's a def after LoopDec. + +--- | + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { + entry: + %scevgep = getelementptr i32, i32* %q, i32 -1 + %scevgep3 = getelementptr i32, i32* %p, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %n) + %limit = lshr i32 %n, 1 + br label %while.body + + while.body: ; preds = %while.body, %entry + %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] + %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] + %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ] + %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1 + %tmp1 = load i32, i32* %scevgep7, align 4 + %tmp2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp, i32 1) + %half = lshr i32 %tmp1, 1 + %cmp = icmp ult i32 %tmp, %limit + %res = select i1 %cmp, i32 %tmp1, i32 %half + store i32 %res, i32* %scevgep4, align 4 + %scevgep1 = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1 + %tmp3 = icmp ne i32 %tmp2, 0 + br i1 %tmp3, label %while.body, label %while.end + + while.end: ; preds = %while.body + ret i32 0 + } + + ; Function Attrs: noduplicate nounwind + declare void @llvm.set.loop.iterations.i32(i32) #0 + + ; Function Attrs: noduplicate nounwind + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #1 + + attributes #0 = { noduplicate nounwind } + attributes #1 = { nounwind } + +... 
+--- +name: do_copy +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: do_copy + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: $lr = tMOVr killed $r0, 14, $noreg + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $r2 = t2LSRri renamable $lr, 1, 14, $noreg, $noreg + ; CHECK: bb.1.while.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $r3, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep7) + ; CHECK: tCMPhir renamable $lr, renamable $r2, 14, $noreg, implicit-def $cpsr + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, $noreg + ; CHECK: t2IT 2, 8, implicit-def $itstate + ; CHECK: renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4) + ; CHECK: t2CMPri renamable $lr, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.1, 4, killed $cpsr + ; CHECK: tB %bb.2, 14, $noreg + ; CHECK: bb.2.while.end: + ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $lr = tMOVr killed $r0, 14, $noreg + renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + renamable $r2 = t2LSRri renamable $lr, 1, 14, $noreg, $noreg + t2DoLoopStart 
renamable $lr + + bb.1.while.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2 + + renamable $r3, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep7) + tCMPhir renamable $lr, renamable $r2, 14, $noreg, implicit-def $cpsr + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2IT 2, 8, implicit-def $itstate + renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate + early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4) + t2CMPri renamable $lr, 0, 14, $noreg, implicit-def $cpsr + tBcc %bb.1, 4, killed $cpsr + tB %bb.2, 14, $noreg + + bb.2.while.end: + $r0, dead $cpsr = tMOVi8 0, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir new file mode 100644 index 0000000000000..c052e22d217d6 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir @@ -0,0 +1,152 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -run-pass=arm-low-overhead-loops -verify-machineinstrs %s -o - | FileCheck %s +# Check that subs isn't used during the revert because there's a cpsr use after it. + +--- | + define i32 @do_copy(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { + entry: + %scevgep = getelementptr i32, i32* %q, i32 -1 + %scevgep3 = getelementptr i32, i32* %p, i32 -1 + call void @llvm.set.loop.iterations.i32(i32 %n) + %limit = lshr i32 %n, 1 + br label %while.body + + while.body: ; preds = %while.body, %entry + %lsr.iv4 = phi i32* [ %scevgep5, %while.body ], [ %scevgep3, %entry ] + %lsr.iv = phi i32* [ %scevgep1, %while.body ], [ %scevgep, %entry ] + %tmp = phi i32 [ %n, %entry ], [ %tmp2, %while.body ] + %scevgep7 = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep4 = getelementptr i32, i32* %lsr.iv4, i32 1 + %tmp1 = load i32, i32* %scevgep7, align 4 + %tmp2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp, i32 1) + %half = lshr i32 %tmp1, 1 + %cmp = icmp ult i32 %tmp, %limit + %res = select i1 %cmp, i32 %tmp1, i32 %half + store i32 %res, i32* %scevgep4, align 4 + %scevgep1 = getelementptr i32, i32* %lsr.iv, i32 1 + %scevgep5 = getelementptr i32, i32* %lsr.iv4, i32 1 + %tmp3 = icmp ne i32 %tmp2, 0 + br i1 %tmp3, label %while.body, label %while.end + + while.end: ; preds = %while.body + ret i32 0 + } + + ; Function Attrs: noduplicate nounwind + declare void @llvm.set.loop.iterations.i32(i32) #0 + + ; Function Attrs: noduplicate nounwind + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 + + ; Function Attrs: nounwind + declare void @llvm.stackprotector(i8*, i8**) #1 + + attributes #0 = { noduplicate nounwind } + attributes #1 = { nounwind } + +... 
+--- +name: do_copy +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: do_copy + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + ; CHECK: renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg + ; CHECK: $lr = tMOVr killed $r0, 14, $noreg + ; CHECK: bb.1.while.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: renamable $r3, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep7) + ; CHECK: tCMPhir renamable $lr, renamable $r2, 14, $noreg, implicit-def $cpsr + ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, $noreg + ; CHECK: t2IT 2, 8, implicit-def $itstate + ; CHECK: renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate + ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4) + ; CHECK: t2CMPri $lr, 0, 14, $noreg, implicit-def $cpsr + ; CHECK: tBcc %bb.1, 1, $cpsr + ; CHECK: tB %bb.2, 14, $noreg + ; CHECK: bb.2.while.end: + ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg + ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + + frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg + renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg + t2DoLoopStart renamable $r0 + renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg + $lr = tMOVr killed $r0, 14, $noreg + + 
bb.1.while.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $r0, $r1, $r2 + + renamable $r3, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep7) + tCMPhir renamable $lr, renamable $r2, 14, $noreg, implicit-def $cpsr + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2IT 2, 8, implicit-def $itstate + renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate + early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14, $noreg + + bb.2.while.end: + $r0, dead $cpsr = tMOVi8 0, 14, $noreg + tPOP_RET 14, $noreg, def $r7, def $pc, implicit killed $r0 + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll index 02bf12ce62004..04f408d78acb8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -9,28 +9,21 @@ define dso_local i32 @mul_reduce_add(i32* noalias nocapture readonly %a, i32* no ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 ; CHECK-NEXT: mov r3, r2 -; CHECK-NEXT: vmul.i32 q0, q2, q0 -; CHECK-NEXT: adds r0, #16 -; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: vmul.i32 q1, q2, q1 ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vaddv.u32 r0, q0 ; CHECK-NEXT: pop {r7, pc} entry: @@ -82,19 +75,13 @@ define dso_local i32 @mul_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r1, r2, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB1_1 @@ -148,19 +135,13 @@ define dso_local i32 @add_reduce_add_const(i32* noalias nocapture readonly %a, i ; CHECK-NEXT: moveq r0, #0 ; CHECK-NEXT: bxeq lr ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: adds r1, r2, #3 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: bic r1, r1, #3 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr 
+; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: letp lr, .LBB2_1 @@ -213,20 +194,13 @@ define dso_local void @vector_mul_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmul.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vmul.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -272,20 +246,13 @@ define dso_local void @vector_add_const(i32* noalias nocapture %a, i32* noalias ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dlstp.32 lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vadd.i32 q0, q0, r2 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -331,13 +298,8 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocaptur ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, pc} -; CHECK-NEXT: add.w r12, r3, #15 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #15 -; CHECK-NEXT: sub.w r12, r12, #16 -; CHECK-NEXT: add.w lr, lr, r12, lsr #4 ; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: dlstp.8 lr, lr +; CHECK-NEXT: dlstp.8 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: add.w r4, r1, r12 @@ -396,22 +358,14 @@ define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapt ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #7 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #7 -; CHECK-NEXT: sub.w r12, r12, #8 -; CHECK-NEXT: add.w lr, lr, r12, lsr #3 -; CHECK-NEXT: dlstp.16 lr, lr +; CHECK-NEXT: dlstp.16 lr, r3 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: vldrh.u16 q1, [r2] -; CHECK-NEXT: vmul.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: adds r1, #16 -; CHECK-NEXT: adds r2, #16 -; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: vldrh.u16 q0, [r1], #16 +; CHECK-NEXT: vldrh.u16 q1, [r2], #16 ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.i16 q0, q1, q0 
+; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll index dbf40f60cbd9a..38dc5ce54bcbd 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-reduce-mve-tail.ll @@ -6,13 +6,13 @@ ; CHECK: vector.body: ; CHECK-NOT: phi i32 [ 0, %vector.ph ] ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ] -; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]]) +; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]] ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], ; CHECK: middle.block: -; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]]) +; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]]) ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]], ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]]) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 69f23f6050131..33389f4c2941c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -195,12 +195,7 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 - ; CHECK: renamable $r12 = t2ADDri renamable $r3, 15, 14, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 15, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 16, 14, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 35, 14, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_8 renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_8 renamable $r3, %bb.1 ; CHECK: tB %bb.3, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -216,7 +211,7 @@ body: | ; CHECK: renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg - ; CHECK: renamable $q0 = MVE_VMULt1i8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1) ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: @@ -257,7 +252,7 @@ body: | renamable $r4 = t2ADDrr renamable $r0, renamable $r12, 14, $noreg, $noreg renamable $r12 = t2ADDri killed renamable $r12, 16, 14, $noreg, $noreg renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg - renamable $q0 = MVE_VMULt1i8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 
0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 1, killed renamable $vpr :: (store 16 into %ir.scevgep1, align 1) renamable $lr = t2LoopDec killed renamable $lr, 1 @@ -323,19 +318,14 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r12 = t2ADDri renamable $r3, 7, 14, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 7, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 8, 14, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 27, 14, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_16 renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_16 renamable $r3, %bb.1 ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2) ; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2) - ; CHECK: renamable $q0 = MVE_VMULt1i16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRHU16 killed renamable $q0, renamable $r0, 0, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 2) ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg @@ -368,7 +358,7 @@ body: | MVE_VPST 4, implicit $vpr renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv57, align 2) renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 2) - renamable $q0 = MVE_VMULt1i16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 MVE_VPST 8, implicit $vpr MVE_VSTRHU16 killed renamable $q0, renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 2) renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg @@ -437,13 +427,8 @@ body: | ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg - ; CHECK: $lr = MVE_WLSTP_32 renamable $lr, %bb.1 + ; CHECK: $lr = MVE_WLSTP_32 $r2, %bb.1 ; CHECK: tB %bb.4, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) @@ -456,7 +441,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, killed $noreg :: (load 16 from 
%ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14, $noreg - ; CHECK: renamable $q1 = nsw MVE_VMULt1i32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg @@ -505,7 +490,7 @@ body: | renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) $r3 = tMOVr $r2, 14, $noreg - renamable $q1 = nsw MVE_VMULt1i32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/ifcvt-neon-deprecated.mir b/llvm/test/CodeGen/Thumb2/ifcvt-neon-deprecated.mir index 58ddfcc2a683e..1f5edb0c78b91 100644 --- a/llvm/test/CodeGen/Thumb2/ifcvt-neon-deprecated.mir +++ b/llvm/test/CodeGen/Thumb2/ifcvt-neon-deprecated.mir @@ -1,54 +1,89 @@ -# RUN: llc -mtriple=thumbv7 -start-before=if-converter -o - %s | FileCheck %s +# RUN: llc -mtriple=thumbv7 -start-before=if-converter %s -o - | FileCheck %s + +--- | + ; ModuleID = 'vdup-test.ll' + source_filename = "vdup-test.ll" + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv7" + + define arm_aapcs_vfpcc <2 x i32> @NeonVdupMul(i32 %scalar, i32 %N, <2 x i32> %vector) { + entry: + %cmp = icmp ne i32 %N, 0 + %broadcast = insertelement <2 x i32> undef, i32 %scalar, i32 0 + %dup = shufflevector <2 x i32> %broadcast, <2 x i32> undef, <2 x i32> zeroinitializer + %mul = mul <2 x i32> %dup, %vector + br i1 %cmp, label %select.end, label %select.false + + select.false: ; preds = %entry + br label %select.end + + select.end: ; preds = %entry, %select.false + %res = phi <2 x i32> [ %mul, %entry ], [ %vector, %select.false ] + ret <2 x i32> %res + } + +... 
--- name: NeonVdupMul +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$d0', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +callSites: [] +constants: [] +machineFunctionInfo: {} body: | - bb.0: - successors: %bb.2, %bb.1 + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) liveins: $d0, $r0, $r1 - - t2CMPri killed $r1, 0, 14, $noreg, implicit-def $cpsr + + t2CMPri killed renamable $r1, 0, 14, $noreg, implicit-def $cpsr t2Bcc %bb.2, 0, killed $cpsr - + bb.1: + successors: %bb.2(0x80000000) liveins: $d0, $r0 - - $d16 = VDUP32d killed $r0, 14, $noreg + + renamable $d16 = VDUP32d killed renamable $r0, 14, $noreg ; Verify that the neon instructions haven't been conditionalized: ; CHECK-LABEL: NeonVdupMul ; CHECK: vdup.32 ; CHECK: vmul.i32 - $d0 = VMULv2i32 killed $d16, killed $d0, 14, $noreg - - bb.2: + renamable $d0 = VMULv2i32 killed renamable $d16, killed renamable $d0, 14, $noreg + + bb.2.select.end: liveins: $d0 - - tBX_RET 14, $noreg, implicit $d0 - -... ---- -name: NeonVmovVfpLdr -body: | - bb.0.entry: - successors: %bb.1, %bb.2 - liveins: $r0, $r1 - - t2CMPri killed $r1, 0, 14, $noreg, implicit-def $cpsr - t2Bcc %bb.2, 1, killed $cpsr - - bb.1: - $d0 = VMOVv2i32 0, 14, $noreg - tBX_RET 14, $noreg, implicit $d0 - - bb.2: - liveins: $r0 - - $d0 = VLDRD killed $r0, 0, 14, $noreg - ; Verify that the neon instruction VMOVv2i32 hasn't been conditionalized, - ; but the VLDR instruction that is available both in the VFP and Advanced - ; SIMD extensions has. - ; CHECK-LABEL: NeonVmovVfpLdr - ; CHECK-DAG: vmov.i32 d0, #0x0 - ; CHECK-DAG: vldr{{ne|eq}} d0, [r0] + tBX_RET 14, $noreg, implicit $d0 ... 
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll new file mode 100644 index 0000000000000..f5b541203f6a3 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/predicates.ll @@ -0,0 +1,219 @@ +; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - | FileCheck %s + +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x i1> @llvm.arm.mve.vctp64(i32) + +declare i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1>) +declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) +declare i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1>) + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +define arm_aapcs_vfpcc zeroext i16 @test_vctp8q(i32 %a) { +; CHECK-LABEL: test_vctp8q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.8 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp8q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp8q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.8 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %a) + %3 = and <16 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v16i1(<16 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp16q(i32 %a) { +; CHECK-LABEL: test_vctp16q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp16q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp16q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.16 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %a) + %3 = and <8 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp32q(i32 %a) { +; CHECK-LABEL: test_vctp32q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.32 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp32q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp32q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.32 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc 
zeroext i16 @test_vctp64q(i32 %a) { +; CHECK-LABEL: test_vctp64q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.64 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %1 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vctp64q_m(i32 %a, i16 zeroext %p) { +; CHECK-LABEL: test_vctp64q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vctpt.64 r0 +; CHECK-NEXT: vmrs r0, p0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i1> @llvm.arm.mve.vctp64(i32 %a) + %3 = and <4 x i1> %1, %2 + %4 = call i32 @llvm.arm.mve.pred.v2i.v4i1(<4 x i1> %3) + %5 = trunc i32 %4 to i16 + ret i16 %5 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vpselq_i8(<16 x i8> %a, <16 x i8> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = select <16 x i1> %1, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vpselq_i16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vpselq_f16(<8 x half> %a, <8 x half> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x half> %a, <8 x half> %b + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vpselq_i32(<4 x i32> %a, <4 x i32> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vpselq_f32(<4 x float> %a, <4 x float> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x float> %a, <4 x float> %b + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vpselq_i64(<2 x i64> %a, <2 x i64> %b, i16 zeroext %p) #2 { +; CHECK-LABEL: test_vpselq_i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = bitcast <2 x i64> %a to <4 x i32> + %3 = bitcast <2 x i64> %b to <4 x i32> + %4 = select <4 x i1> %1, <4 x i32> %2, <4 x i32> %3 + %5 = bitcast <4 x i32> %4 to <2 x i64> + ret <2 x i64> %5 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll 
b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll new file mode 100644 index 0000000000000..bafff00ea1de9 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vabdq.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: test_vabdq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabd.s32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>%a, <4 x i32>%b) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vabd.v4i32(<4 x i32>, <4 x i32>) + +define arm_aapcs_vfpcc <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vabdq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vabd.f32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x float> @llvm.arm.mve.vabd.v4f32(<4 x float>%a, <4 x float>%b) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vabd.v4f32(<4 x float>, <4 x float>) + +define arm_aapcs_vfpcc <16 x i8> @test_vabdq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vabdq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabdt.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +declare <16 x i8> @llvm.arm.mve.abd.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) + +define arm_aapcs_vfpcc <8 x half> @test_vabdq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vabdq_m_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vabdt.f16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive) + ret <8 x half> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +declare <8 x half> @llvm.arm.mve.abd.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll new file mode 100644 index 0000000000000..1b1d498bc378d --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vandq.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vandq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = and <16 x i8> %b, %a + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vandq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = and <4 x i32> %b, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> 
@test_vandq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vandq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = and <8 x i16> %b, %a + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vandq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vandq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vand q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = and <4 x i32> %1, %0 + %3 = bitcast <4 x i32> %2 to <4 x float> + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vandq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vandq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.and.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.and.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define arm_aapcs_vfpcc <8 x i16> @test_vandq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vandq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.and.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +; Function Attrs: nounwind readnone +define arm_aapcs_vfpcc <8 x half> @test_vandq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vandq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vandt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = zext i16 %p to i32 + %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = bitcast <4 x float> %inactive to <4 x i32> + %5 = tail call <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4) + %6 = bitcast <4 x i32> %5 to <8 x half> + ret <8 x half> %6 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.and.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll new file mode 100644 index 0000000000000..47877a13cb96e --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vbicq.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; 
CHECK-LABEL: test_vbicq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %1 = and <16 x i8> %0, %a + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vbicq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> + %1 = and <4 x i32> %0, %a + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vbicq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %1 = and <8 x i16> %0, %a + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vbicq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vbicq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1> + %3 = and <4 x i32> %2, %0 + %4 = bitcast <4 x i32> %3 to <4 x float> + ret <4 x float> %4 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vbicq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vbicq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.bic.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.bic.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define arm_aapcs_vfpcc <8 x i16> @test_vbicq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vbicq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.bic.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +; Function Attrs: nounwind readnone +define arm_aapcs_vfpcc <8 x half> @test_vbicq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vbicq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = zext i16 %p to i32 + %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = bitcast <4 x float> %inactive to <4 x i32> + %5 = tail call <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4) + %6 = bitcast <4 x i32> %5 to <8 x half> + ret <8 x half> %6 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.bic.predicated.v4i32.v4i1(<4 x
i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll new file mode 100644 index 0000000000000..9b66f3656eb27 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/veorq.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_veorq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = xor <16 x i8> %b, %a + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_veorq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = xor <4 x i32> %b, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_veorq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = xor <8 x i16> %b, %a + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_veorq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_veorq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: veor q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = xor <4 x i32> %1, %0 + %3 = bitcast <4 x i32> %2 to <4 x float> + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_veorq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_veorq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.eor.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.eor.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define arm_aapcs_vfpcc <8 x i16> @test_veorq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_veorq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.eor.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +; Function Attrs: nounwind readnone +define arm_aapcs_vfpcc <8 x half> @test_veorq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_veorq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: veort q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast 
<4 x float> %b to <4 x i32> + %2 = zext i16 %p to i32 + %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = bitcast <4 x float> %inactive to <4 x i32> + %5 = tail call <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4) + %6 = bitcast <4 x i32> %5 to <8 x half> + ret <8 x half> %6 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.eor.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll new file mode 100644 index 0000000000000..d89308bb59412 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxnmq.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define dso_local arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxnmq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x half> @llvm.maxnum.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %0 +} + +declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>) #1 + +define dso_local arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxnmq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxnm.f32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #1 + +define dso_local arm_aapcs_vfpcc <8 x half> @test_vmaxnmq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxnmq_m_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxnmt.f32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive) + ret <8 x half> %2 +} +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x half> @llvm.arm.mve.max.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 + +define dso_local arm_aapcs_vfpcc <4 x float> @test_vmaxnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxnmq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxnmt.f32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive) + ret <4 x float> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x float> @llvm.arm.mve.max.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll new file mode 100644 index 0000000000000..09a7d60cd1650 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmaxq.ll @@ -0,0 +1,89 @@ 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define dso_local arm_aapcs_vfpcc <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.u8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = icmp ugt <16 x i8> %a, %b + %1 = select <16 x i1> %0, <16 x i8> %b, <16 x i8> %a + ret <16 x i8> %1 +} + +define dso_local arm_aapcs_vfpcc <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.s16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = icmp sgt <8 x i16> %a, %b + %1 = select <8 x i1> %0, <8 x i16> %b, <8 x i16> %a + ret <8 x i16> %1 +} + +define dso_local arm_aapcs_vfpcc <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmaxq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.u32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = icmp ugt <4 x i32> %a, %b + %1 = select <4 x i1> %0, <4 x i32> %b, <4 x i32> %a + ret <4 x i32> %1 +} + +define dso_local arm_aapcs_vfpcc <16 x i8> @test_vmaxq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vmaxq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmint.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define dso_local arm_aapcs_vfpcc <8 x i16> @test_vmaxq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vmaxq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmint.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +define dso_local arm_aapcs_vfpcc <4 x i32> @test_vmaxq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vmaxq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmint.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll new file mode 100644 index 
0000000000000..10cd674d39a8f --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminnmq.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define dso_local arm_aapcs_vfpcc <8 x half> @test_vminnmq_f16(<8 x half> %a, <8 x half> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vminnmq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x half> @llvm.minnum.v8f16(<8 x half> %a, <8 x half> %b) + ret <8 x half> %0 +} + +declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>) #1 + +define dso_local arm_aapcs_vfpcc <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vminnmq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminnm.f32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #1 + +define dso_local arm_aapcs_vfpcc <8 x half> @test_vminnmq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vminnmq_m_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminnmt.f32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive) + ret <8 x half> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x half> @llvm.arm.mve.min.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) #2 + +define dso_local arm_aapcs_vfpcc <4 x float> @test_vminnmq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vminnmq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminnmt.f32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %1, <4 x float> %inactive) + ret <4 x float> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x float> @llvm.arm.mve.min.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll new file mode 100644 index 0000000000000..0cbef86c928f7 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminq.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define dso_local arm_aapcs_vfpcc <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vminq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.u8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = icmp ugt <16 x i8> %a, %b + %1 = select <16 x i1> %0, <16 x i8> %b, <16 x i8> %a + ret <16 x i8> %1 +} + +define dso_local arm_aapcs_vfpcc <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: 
test_vminq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.s16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = icmp sgt <8 x i16> %a, %b + %1 = select <8 x i1> %0, <8 x i16> %b, <8 x i16> %a + ret <8 x i16> %1 +} + +define dso_local arm_aapcs_vfpcc <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vminq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmin.u32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = icmp ugt <4 x i32> %a, %b + %1 = select <4 x i1> %0, <4 x i32> %b, <4 x i32> %a + ret <4 x i32> %1 +} + +define dso_local arm_aapcs_vfpcc <16 x i8> @test_vminq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vminq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmint.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.min.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define dso_local arm_aapcs_vfpcc <8 x i16> @test_vminq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vminq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmint.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.min.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +define dso_local arm_aapcs_vfpcc <4 x i32> @test_vminq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vminq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmint.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.min.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll new file mode 100644 index 0000000000000..78ee17b554160 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulhq.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmulhq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmulh.s8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <16 x i8> @llvm.arm.mve.vmulh.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %0 +} + +declare <16 x i8> 
@llvm.arm.mve.vmulh.v16i8(<16 x i8>, <16 x i8>) #1 + +define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmulhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmulh.s16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vmulh.v8i16(<8 x i16>, <8 x i16>) #1 + +define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmulhq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmulh.s32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vmulh.v4i32(<4 x i32>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmulhq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmulht.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 + +declare <16 x i8> @llvm.arm.mve.mulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 + +define arm_aapcs_vfpcc <8 x i16> @test_vmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmulhq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmulht.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 + +declare <8 x i16> @llvm.arm.mve.mulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 + +define arm_aapcs_vfpcc <4 x i32> @test_vmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vmulhq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmulht.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 + +declare <4 x i32> @llvm.arm.mve.mulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll new file mode 100644 index 0000000000000..09d8e11a71aed --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) { 
+; CHECK-LABEL: test_vmulq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.i32 q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = mul <4 x i32> %b, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_vmulq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmul.f32 q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = fmul <4 x float> %b, %a + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vmulq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.i8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +declare <16 x i8> @llvm.arm.mve.mul.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) + +define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_f16(<8 x half> %inactive, <8 x half> %a, <8 x half> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmulq_m_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmult.f16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %1, <8 x half> %inactive) + ret <8 x half> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +declare <8 x half> @llvm.arm.mve.mul.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>, <8 x half>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll new file mode 100644 index 0000000000000..48f6a3cd23ad2 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vornq.ll @@ -0,0 +1,108 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vornq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorn q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %1 = or <16 x i8> %0, %a + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vornq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorn q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> + %1 = or <4 x i32> %0, %a + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vornq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorn q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %1 = or <8 x i16> %0, %a + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vornq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vornq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorn q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1> + %3 = or <4 x i32> %2, %0 + %4 = bitcast <4 x
i32> %3 to <4 x float> + ret <4 x float> %4 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vornq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vornq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vornt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.orn.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.orn.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define arm_aapcs_vfpcc <8 x i16> @test_vornq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vornq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vornt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.orn.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +; Function Attrs: nounwind readnone +define arm_aapcs_vfpcc <8 x half> @test_vornq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vornq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vornt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = zext i16 %p to i32 + %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = bitcast <4 x float> %inactive to <4 x i32> + %5 = tail call <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4) + %6 = bitcast <4 x i32> %5 to <8 x half> + ret <8 x half> %6 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.orn.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll new file mode 100644 index 0000000000000..ccb511a85e571 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vorrq.ll @@ -0,0 +1,104 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vorrq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = or <16 x i8> %b, %a + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vorrq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = or <4 x i32> %b, %a + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vorrq_s16: +; CHECK: @ %bb.0: @ %entry 
+; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = or <8 x i16> %b, %a + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vorrq_f32(<4 x float> %a, <4 x float> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vorrq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = or <4 x i32> %1, %0 + %3 = bitcast <4 x i32> %2 to <4 x float> + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vorrq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vorrq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #2 + +declare <16 x i8> @llvm.arm.mve.orr.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #2 + +define arm_aapcs_vfpcc <8 x i16> @test_vorrq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vorrq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #2 + +declare <8 x i16> @llvm.arm.mve.orr.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #2 + +; Function Attrs: nounwind readnone +define arm_aapcs_vfpcc <8 x half> @test_vorrq_m_f32(<4 x float> %inactive, <4 x float> %a, <4 x float> %b, i16 zeroext %p) local_unnamed_addr #1 { +; CHECK-LABEL: test_vorrq_m_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = zext i16 %p to i32 + %3 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = bitcast <4 x float> %inactive to <4 x i32> + %5 = tail call <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32> %0, <4 x i32> %1, <4 x i1> %3, <4 x i32> %4) + %6 = bitcast <4 x i32> %5 to <8 x half> + ret <8 x half> %6 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #2 + +declare <4 x i32> @llvm.arm.mve.orr.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #2 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll new file mode 100644 index 0000000000000..3975e4eca8727 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vrmulhq.ll @@ -0,0 +1,92 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_u8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vrmulhq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmulh.s8 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: 
+ %0 = tail call <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %0 +} + +declare <16 x i8> @llvm.arm.mve.vrmulh.v16i8(<16 x i8>, <16 x i8>) #1 + +define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_s16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vrmulhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmulh.s16 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vrmulh.v8i16(<8 x i16>, <8 x i16>) #1 + +define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_u32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_vrmulhq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrmulh.s32 q0, q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32> %a, <4 x i32> %b) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vrmulh.v4i32(<4 x i32>, <4 x i32>) #1 + +define arm_aapcs_vfpcc <16 x i8> @test_vrmulhq_m_s8(<16 x i8> %inactive, <16 x i8> %a, <16 x i8> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vrmulhq_m_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmulht.s8 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8> %a, <16 x i8> %b, <16 x i1> %1, <16 x i8> %inactive) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) #1 + +declare <16 x i8> @llvm.arm.mve.rmulh.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) #1 + +define arm_aapcs_vfpcc <8 x i16> @test_vrmulhq_m_u16(<8 x i16> %inactive, <8 x i16> %a, <8 x i16> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vrmulhq_m_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmulht.s16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16> %a, <8 x i16> %b, <8 x i1> %1, <8 x i16> %inactive) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) #1 + +declare <8 x i16> @llvm.arm.mve.rmulh.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) #1 + +define arm_aapcs_vfpcc <4 x i32> @test_vrmulhq_m_s32(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i16 zeroext %p) local_unnamed_addr #0 { +; CHECK-LABEL: test_vrmulhq_m_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vrmulht.s32 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i1> %1, <4 x i32> %inactive) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #1 + +declare <4 x i32> @llvm.arm.mve.rmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll new file mode 100644 index 0000000000000..ba3ef58c3c2eb --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll @@ -0,0 +1,2646 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc 
-mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #512 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, 
zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #512 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %z to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef) + %2 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %1, <4 x i32>* %2, align 4 + ret i8* %x +} + +define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r0, #2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> 
@llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r0, #254] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #256 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r0, #-254] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhu32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #256 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = zext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = 
icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r0, #2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r0, #254] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #256 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r0, #-254] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + %0 = bitcast i8* %z to <4 x i16>* + %mask 
= load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrhs32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #256 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrht.s32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef) + %2 = sext <4 x i16> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #4] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #2] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #254] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, 
zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #256 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #256 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v4i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %x +} + +define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x 
i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #127] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #128 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #128 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, 
i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #2] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #127] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #128 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x 
i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #128 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %x +} + +define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #4] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #2] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = 
zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #127] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #128 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #128 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #4] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> 
%c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #2] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #127] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #128 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> 
undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #128 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %x +} + +define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #4] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #3] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #2] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #127] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* 
@ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #128 +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r3] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #128 +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r3] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %x +} + +define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #4] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_2: +; CHECK: @ %bb.0: @ 
%entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #512 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #512 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 
q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #4] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #2] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #254] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #256 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] +; CHECK-NEXT: vstrh.16 q0, 
[r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #256 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + + + + +define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #508] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x 
i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #512 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #512 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void 
@llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #2] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #254] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #256 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #-254] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #256 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_4: +; CHECK: @ 
%bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #2] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #254] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #256 +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #-254] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr 
inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #256 +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #3] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #2] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #127] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void 
@llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #128 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #-127] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #128 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %y +} + +define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #3] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; 
CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #2] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #127] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #128 +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #-127] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #128 +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %y +} + +define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x 
i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q1, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q1, zr
+; CHECK-NEXT:    vstrbt.8 q0, [r0, #3]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q1, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q1, zr
+; CHECK-NEXT:    vstrbt.8 q0, [r0, #2]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 2
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q1, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q1, zr
+; CHECK-NEXT:    vstrbt.8 q0, [r0, #127]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 127
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    add.w r1, r0, #128
+; CHECK-NEXT:    vldrb.u8 q1, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q1, zr
+; CHECK-NEXT:    vstrbt.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 128
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    vldrb.u8 q1, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q1, zr
+; CHECK-NEXT:    vstrbt.8 q0, [r0, #-127]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -127
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r1]
+; CHECK-NEXT:    sub.w r1, r0, #128
+; CHECK-NEXT:    vldrb.u8 q1, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q1, zr
+; CHECK-NEXT:    vstrbt.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -128
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = load <16 x i8>, <16 x i8>* %0, align 1
+  %2 = bitcast i8* %z to <16 x i8>*
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0, #4]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 4
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    adds r1, r0, #2
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 2
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0, #508]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 508
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    add.w r1, r0, #512
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 512
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_m508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0, #-508]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -508
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_m512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    sub.w r1, r0, #512
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -512
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x float>, <4 x float>* %0, align 4
+  %2 = bitcast i8* %z to <4 x float>*
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0, #4]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 4
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    adds r1, r0, #3
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0, #2]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 2
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0, #254]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 254
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    add.w r1, r0, #256
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 256
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0, #-254]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -254
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    sub.w r1, r0, #256
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -256
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %y
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
new file mode 100644
index 0000000000000..0951589eaa14c
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
@@ -0,0 +1,2646 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+
+define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0], #4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    adds r0, #2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0], #508
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 508
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #512
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 512
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0], #-508
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -508
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #512
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -512
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0], #4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0], #2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0], #254
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #256
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0], #-254
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #256
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0], #4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0], #2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0], #254
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #256
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0], #-254
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #256
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %x to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #4
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #2
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #254
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #256
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #-254
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #256
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %x to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0], #4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0], #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0], #2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0], #127
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0]
+; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0], #-127
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0]
+; CHECK-NEXT:    subs r0, #128
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0], #4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0], #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0], #2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0], #127
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0]
+; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0], #-127
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0]
+; CHECK-NEXT:    subs r0, #128
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %x to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %z
+}
+
+define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #4
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #2
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #127
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0]
+; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0], #-127
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0]
+; CHECK-NEXT:    subs r0, #128
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0], #4
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0], #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0], #2
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0], #127
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0]
+; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0], #-127
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0]
+; CHECK-NEXT:    subs r0, #128
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %x to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %z
+}
+
+define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #4
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #3
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #2
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #127
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    adds r0, #128
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0], #-127
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0]
+; CHECK-NEXT:    subs r0, #128
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %x to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %z
+}
+
+define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0], #4
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    adds r0, #2
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0], #508
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 508
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #512
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 512
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_m508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0], #-508
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -508
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_m512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #512
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -512
+  %0 = bitcast i8* %x to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+  %2 = bitcast i8* %y to <4 x float>*
+  store <4 x float> %1, <4 x float>* %2, align 4
+  ret i8* %z
+}
+
+define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #4
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #2
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #254
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #256
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0], #-254
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #256
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+  %2 = bitcast i8* %y to <8 x half>*
+  store <8 x half> %1, <8 x half>* %2, align 2
+  ret i8* %z
+}
+
+
+
+
+define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0], #4
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 4
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    adds r0, #3
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 3
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    adds r0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 2
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0], #508
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 508
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    add.w r0, r0, #512
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 512
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_m508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0], #-508
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -508
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_m512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q1, zr
+; CHECK-NEXT:    vstrwt.32 q0, [r0]
+; CHECK-NEXT:    sub.w r0, r0, #512
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -512
+  %0 = bitcast i8* %x to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = load <4 x i32>, <4 x i32>* %0, align 4
+  %2 = bitcast i8* %y to <4 x i32>*
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c)
+
ret i8* %z +} + +define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0], #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0], #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 
ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0], #-254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + 
%c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #-254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0], #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> 
%c) + ret i8* %z +} + +define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0], #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0], #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0], #-127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; 
CHECK-NEXT: vstrbt.16 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0], #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0], #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0], #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0], #-127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* 
%0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0], #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0], #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0], #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_128: +; CHECK: @ %bb.0: @ %entry +; 
CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0], #-127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: 
bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0], #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0], #-508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %y to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, 
<8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0], #-254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void 
@llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %y to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll new file mode 100644 index 0000000000000..beb5aae634116 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll @@ -0,0 +1,2646 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE + +define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]! 
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #4]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #2]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #254]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #-254]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #4]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #2]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #254]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #-254]!
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #4]!
+; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #2]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #254]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]! 
+; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhu16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef) + %2 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %1, <8 x i16>* %2, align 2 + ret i8* %z +} + +define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #4]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #3]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #2]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #127]! 
+; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbu32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = zext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #4]! 
+; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #3]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #2]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #127]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127]! 
+; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #4]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #3]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #2]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #127]! 
+; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #4]! 
+; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #3]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #2]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #127]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127]! 
+; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #4]! +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #3]! +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #2]! +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #127]! 
+; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127]! +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %z to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]! 
+; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]! +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]! 
+; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #4]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #2]! +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #254]! 
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]!
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+; The equivalent masked store tests. As the load tests above show, the
+; pre-increment (writeback) form is only used when the offset is a multiple
+; of the element size and in range: +/-127 for byte, +/-254 for halfword and
+; +/-508 for word accesses. Any other offset is materialised with a separate
+; add/sub first.
+
+define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strw32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0, #4]!
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #508]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #4]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #2]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #254]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #-254]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #4]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #2]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #254]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #-254]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #4]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #3]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #2]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #127]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #4]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #3]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #2]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #127]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #4]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #3]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #2]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #127]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #4]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #508]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]! 
+; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #4]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #2]! +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #254]! 
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 254
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    add.w r0, r0, #256
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 256
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0, #-254]!
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -254
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %z
+}
+
+define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    sub.w r0, r0, #256
+; CHECK-NEXT:    vldrh.u16 q0, [r1]
+; CHECK-NEXT:    vldrh.u16 q1, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q1, zr
+; CHECK-NEXT:    vstrht.16 q0, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %y, i32 -256
+  %0 = bitcast i8* %x to <8 x half>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = load <8 x half>, <8 x half>* %0, align 2
+  %2 = bitcast i8* %z to <8 x half>*
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+  ret i8* %z
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
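+
+; A closing observation on the offsets exercised above (inferred from the
+; CHECK lines; the exact encoding rules live in the ARM backend, not in this
+; file): the MVE pre/post-increment loads and stores take what is consistent
+; with a 7-bit immediate scaled by the element size, so the largest offsets
+; that fold into the instruction are +/-127 for the .8, +/-254 for the .16 and
+; +/-508 for the .32 forms. One step past each bound (128, 256, 512) no longer
+; folds, and the tests expect a separate adds/add.w or subs/sub.w on the base
+; register instead, e.g.:
+;
+;   vstrht.16 q0, [r0, #254]!   @ 254 = 127 * 2, folds into the instruction
+;
+;   add.w r0, r0, #256          @ 256 is out of range, so the address is
+;   vstrht.16 q0, [r0]          @ materialised first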
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
index 46b64c8e4d8b1..100a082fd12be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll
@@ -13,8 +13,8 @@ define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
-  call void @llvm.masked.store.v4i32(<4 x i32> %2, <4 x i32>* %dest, i32 4, <4 x i1> %1)
+  %2 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %dest, i32 4, <4 x i1> %1)
   ret void
 }
 
@@ -29,9 +29,9 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef)
+  %2 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef)
   %3 = sext <4 x i8> %2 to <4 x i32>
-  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
   ret void
 }
 
@@ -46,9 +46,9 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef)
+  %2 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = sext <4 x i16> %2 to <4 x i32>
-  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
   ret void
 }
 
@@ -63,9 +63,9 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *%
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef)
+  %2 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef)
   %3 = zext <4 x i8> %2 to <4 x i32>
-  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
   ret void
 }
 
@@ -80,9 +80,9 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16>
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef)
+  %2 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef)
   %3 = zext <4 x i16> %2 to <4 x i32>
-  call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1)
   ret void
 }
 
@@ -234,9 +234,9 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 entry:
   %0 = load <2 x i32>, <2 x i32>* %mask, align 4
   %1 = icmp sgt <2 x i32> %0, zeroinitializer
-  %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef)
+  %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef)
   %3 = sext <2 x i32> %2 to <2 x i64>
-  call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1)
+  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1)
   ret void
 }
 
@@ -392,9 +392,9 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 entry:
   %0 = load <2 x i32>, <2 x i32>* %mask, align 4
   %1 = icmp sgt <2 x i32> %0, zeroinitializer
-  %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef)
+  %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef)
   %3 = sext <2 x i32> %2 to <2 x i64>
-  call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1)
+  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1)
   ret void
 }
 
@@ -549,9 +549,9 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32>
 entry:
   %0 = load <2 x i32>, <2 x i32>* %mask, align 4
   %1 = icmp sgt <2 x i32> %0, zeroinitializer
-  %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef)
+  %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef)
   %3 = zext <2 x i32> %2 to <2 x i64>
-  call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1)
+  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1)
   ret void
 }
 
@@ -710,9 +710,9 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask,
 entry:
   %0 = load <2 x i32>, <2 x i32>* %mask, align 4
   %1 = icmp sgt <2 x i32> %0, zeroinitializer
-  %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef)
+  %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef)
   %3 = zext <2 x i32> %2 to <2 x i64>
-  call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1)
+  call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1)
   ret void
 }
 
@@ -727,8 +727,8 @@ define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
   %1 = icmp sgt <8 x i16> %0, zeroinitializer
-  %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef)
-  call void @llvm.masked.store.v8i16(<8 x i16> %2, <8 x i16>* %dest, i32 2, <8 x i1> %1)
+  %2 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %2, <8 x i16>* %dest, i32 2, <8 x i1> %1)
   ret void
 }
 
@@ -743,9 +743,9 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
   %1 = icmp sgt <8 x i16> %0, zeroinitializer
-  %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef)
+  %2 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = sext <8 x i8> %2 to <8 x i16>
-  call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1)
   ret void
 }
 
@@ -760,9 +760,9 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *%
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
   %1 = icmp sgt <8 x i16> %0, zeroinitializer
-  %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef)
+  %2 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef)
   %3 = zext <8 x i8> %2 to <8 x i16>
-  call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1)
+  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1)
   ret void
 }
 
@@ -777,8 +777,8 @@ define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src
 entry:
   %0 = load <16 x i8>, <16 x i8>* %mask, align 1
   %1 = icmp sgt <16 x i8> %0, zeroinitializer
-  %2 = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %src, i32 1, <16 x i1> %1, <16 x i8> undef)
-  call void @llvm.masked.store.v16i8(<16 x i8> %2, <16 x i8>* %dest, i32 1, <16 x i1> %1)
+  %2 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %src, i32 1, <16 x i1> %1, <16 x i8> undef)
+  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %2, <16 x i8>* %dest, i32 1, <16 x i1> %1)
   ret void
 }
 
@@ -793,9 +793,9 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> *
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
   %1 = icmp sgt <8 x i16> %0, zeroinitializer
-  %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef)
+  %2 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef)
   %3 = trunc <8 x i16> %2 to <8 x i8>
-  call void @llvm.masked.store.v8i8(<8 x i8> %3, <8 x i8>* %dest, i32 1, <8 x i1> %1)
+  call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %3, <8 x i8>* %dest, i32 1, <8 x i1> %1)
   ret void
 }
 
@@ -810,9 +810,9 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> *%mask, <4 x i32> *
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %2 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = trunc <4 x i32> %2 to <4 x i8>
-  call void @llvm.masked.store.v4i8(<4 x i8> %3, <4 x i8>* %dest, i32 1, <4 x i1> %1)
+  call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %3, <4 x i8>* %dest, i32 1, <4 x i1> %1)
   ret void
 }
 
@@ -827,9 +827,9 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32>
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
+  %2 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef)
   %3 = trunc <4 x i32> %2 to <4 x i16>
-  call void @llvm.masked.store.v4i16(<4 x i16> %3, <4 x i16>* %dest, i32 2, <4 x i1> %1)
+  call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %3, <4 x i16>* %dest, i32 2, <4 x i1> %1)
   ret void
 }
 
@@ -844,8 +844,8 @@ define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> *
 entry:
   %0 = load <4 x i32>, <4 x i32>* %mask, align 4
   %1 = icmp sgt <4 x i32> %0, zeroinitializer
-  %2 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 4, <4 x i1> %1, <4 x float> undef)
-  call void @llvm.masked.store.v4f32(<4 x float> %2, <4 x float>* %dest, i32 4, <4 x i1> %1)
+  %2 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %src, i32 4, <4 x i1> %1, <4 x float> undef)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %2, <4 x float>* %dest, i32 4, <4 x i1> %1)
   ret void
 }
 
@@ -860,8 +860,8 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s
 entry:
   %0 = load <8 x i16>, <8 x i16>* %mask, align 2
   %1 = icmp sgt <8 x i16> %0, zeroinitializer
-  %2 = call <8 x half> @llvm.masked.load.v8f16(<8 x half>* %src, i32 2, <8 x i1> %1, <8 x half> undef)
-  call void @llvm.masked.store.v8f16(<8 x half> %2, <8 x half>* %dest, i32 2, <8 x i1> %1)
+  %2 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %src, i32 2, <8 x i1> %1, <8 x half> undef)
+  call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %2, <8 x half>* %dest, i32 2, <8 x i1> %1)
   ret void
 }
 
@@ -991,9 +991,9 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *%
 entry:
   %0 = load <4 x i16>, <4 x i16>* %mask, align 2
   %1 = icmp sgt <4 x i16> %0, zeroinitializer
-  %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef)
+  %2 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef)
   %3 = fpext <4 x half> %2 to <4 x float>
-  call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 2, <4 x i1> %1)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %dest, i32 2, <4 x i1> %1)
   ret void
 }
 
@@ -1123,29 +1123,29 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4
 entry:
   %0 = load <4 x i16>, <4 x i16>* %mask, align 2
   %1 = icmp sgt <4 x i16> %0, zeroinitializer
-  %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef)
+  %2 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef)
   %3 = fpext <4 x half> %2 to <4 x float>
-  call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 1, <4 x i1> %1)
+  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %dest, i32 1, <4 x i1> %1)
   ret void
 }
 
-declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
-declare void @llvm.masked.store.v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
-declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
-declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
-declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
-declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
-declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
-declare <4 x half> @llvm.masked.load.v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
-declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>)
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
 
-declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
-declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
-declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
-declare <4 x i16> @llvm.masked.load.v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
-declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
-declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
index e75e07604e879..54a94b8981c2e 100644
--- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll
@@ -468,8 +468,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-LE-LABEL: masked_v4i32_preinc:
 ; CHECK-LE:       @ %bb.0: @ %entry
 ; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0, #4]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vldrwt.u32 q0, [r0, #4]!
 ; CHECK-LE-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-LE-NEXT:    bx lr
 ;
@@ -477,8 +476,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_preinc(i8* %x, i8* %y, <4 x i32> %a) {
 ; CHECK-BE:       @ %bb.0: @ %entry
 ; CHECK-BE-NEXT:    vrev64.32 q1, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q1, zr
-; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0, #4]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vldrwt.u32 q0, [r0, #4]!
; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -495,8 +493,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4i32_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -504,8 +501,7 @@ define arm_aapcs_vfpcc i8* @masked_v4i32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1032,8 +1028,7 @@ define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-NEXT: vldr d1, [sp] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1043,8 +1038,7 @@ define i8* @masked_v8i16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1061,8 +1055,7 @@ define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8i16_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1070,8 +1063,7 @@ define arm_aapcs_vfpcc i8* @masked_v8i16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1151,8 +1143,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-LE-LABEL: masked_v16i8_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrbt.u8 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1160,8 +1151,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_preinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrbt.u8 q0, [r0, #4]! 
; CHECK-BE-NEXT: vstrb.8 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1178,8 +1168,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-LE-LABEL: masked_v16i8_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrbt.u8 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1187,8 +1176,7 @@ define arm_aapcs_vfpcc i8* @masked_v16i8_postinc(i8* %x, i8* %y, <16 x i8> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.8 q1, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u8 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrbt.u8 q0, [r0], #4 ; CHECK-BE-NEXT: vstrb.8 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1355,8 +1343,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1364,8 +1351,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_preinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0, #4]! ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1382,8 +1368,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-LE-LABEL: masked_v4f32_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1391,8 +1376,7 @@ define arm_aapcs_vfpcc i8* @masked_v4f32_postinc(i8* %x, i8* %y, <4 x i32> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.32 q1, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q1, zr -; CHECK-BE-NEXT: vldrwt.u32 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrwt.u32 q0, [r0], #4 ; CHECK-BE-NEXT: vstrw.32 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1724,8 +1708,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_preinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0, #4]! ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1733,8 +1716,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_preinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0, #4]! 
; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: @@ -1751,8 +1733,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-LE-LABEL: masked_v8f16_postinc: ; CHECK-LE: @ %bb.0: @ %entry ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrht.u16 q0, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-LE-NEXT: vstrw.32 q0, [r1] ; CHECK-LE-NEXT: bx lr ; @@ -1760,8 +1741,7 @@ define arm_aapcs_vfpcc i8* @masked_v8f16_postinc(i8* %x, i8* %y, <8 x i16> %a) { ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrht.u16 q0, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vldrht.u16 q0, [r0], #4 ; CHECK-BE-NEXT: vstrh.16 q0, [r1] ; CHECK-BE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll index 1fc9793fd50d4..425162721acf4 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-store.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-store.ll @@ -111,8 +111,7 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_pre: @@ -122,8 +121,7 @@ define i8* @masked_v4i32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -142,8 +140,7 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4i32_post: @@ -153,8 +150,7 @@ define i8* @masked_v4i32_post(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -334,8 +330,7 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrht.16 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_pre: @@ -345,8 +340,7 @@ define i8* @masked_v8i16_pre(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrht.16 q1, [r0, #4]! 
; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -365,8 +359,7 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vstrht.16 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v8i16_post: @@ -376,8 +369,7 @@ define i8* @masked_v8i16_post(i8* %y, i8* %x, <8 x i16> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.16 q2, q0 ; CHECK-BE-NEXT: vpt.s16 gt, q2, zr -; CHECK-BE-NEXT: vstrht.16 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrht.16 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -416,8 +408,7 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrbt.8 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v16i8_pre: @@ -427,8 +418,7 @@ define i8* @masked_v16i8_pre(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr -; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrbt.8 q1, [r0, #4]! ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -447,8 +437,7 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s8 gt, q0, zr -; CHECK-LE-NEXT: vstrbt.8 q1, [r0] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrbt.8 q1, [r0], #4 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v16i8_post: @@ -458,8 +447,7 @@ define i8* @masked_v16i8_post(i8* %y, i8* %x, <16 x i8> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.8 q2, q0 ; CHECK-BE-NEXT: vpt.s8 gt, q2, zr -; CHECK-BE-NEXT: vstrbt.8 q1, [r0] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrbt.8 q1, [r0], #4 ; CHECK-BE-NEXT: bx lr entry: %z = getelementptr inbounds i8, i8* %y, i32 4 @@ -591,8 +579,7 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-LE-NEXT: vldrw.u32 q1, [r1] ; CHECK-LE-NEXT: vmov d0, r2, r3 ; CHECK-LE-NEXT: vpt.s32 gt, q0, zr -; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-LE-NEXT: adds r0, #4 +; CHECK-LE-NEXT: vstrwt.32 q1, [r0, #4]! ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: masked_v4f32_pre: @@ -602,8 +589,7 @@ define i8* @masked_v4f32_pre(i8* %y, i8* %x, <4 x i32> %a) { ; CHECK-BE-NEXT: vmov d0, r3, r2 ; CHECK-BE-NEXT: vrev64.32 q2, q0 ; CHECK-BE-NEXT: vpt.s32 gt, q2, zr -; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4] -; CHECK-BE-NEXT: adds r0, #4 +; CHECK-BE-NEXT: vstrwt.32 q1, [r0, #4]! 
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -622,8 +608,7 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
 ; CHECK-LE-NEXT:    vpt.s32 gt, q0, zr
-; CHECK-LE-NEXT:    vstrwt.32 q1, [r0]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vstrwt.32 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v4f32_post:
@@ -633,8 +618,7 @@ define i8* @masked_v4f32_post(i8* %y, i8* %x, <4 x i32> %a) {
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
 ; CHECK-BE-NEXT:    vrev64.32 q2, q0
 ; CHECK-BE-NEXT:    vpt.s32 gt, q2, zr
-; CHECK-BE-NEXT:    vstrwt.32 q1, [r0]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vstrwt.32 q1, [r0], #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -904,8 +888,7 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
 ; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0, #4]!
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_pre:
@@ -915,8 +898,7 @@ define i8* @masked_v8f16_pre(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
 ; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0, #4]!
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
@@ -935,8 +917,7 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-LE-NEXT:    vldrw.u32 q1, [r1]
 ; CHECK-LE-NEXT:    vmov d0, r2, r3
 ; CHECK-LE-NEXT:    vpt.s16 gt, q0, zr
-; CHECK-LE-NEXT:    vstrht.16 q1, [r0]
-; CHECK-LE-NEXT:    adds r0, #4
+; CHECK-LE-NEXT:    vstrht.16 q1, [r0], #4
 ; CHECK-LE-NEXT:    bx lr
 ;
 ; CHECK-BE-LABEL: masked_v8f16_post:
@@ -946,8 +927,7 @@ define i8* @masked_v8f16_post(i8* %y, i8* %x, <8 x i16> %a) {
 ; CHECK-BE-NEXT:    vmov d0, r3, r2
 ; CHECK-BE-NEXT:    vrev64.16 q2, q0
 ; CHECK-BE-NEXT:    vpt.s16 gt, q2, zr
-; CHECK-BE-NEXT:    vstrht.16 q1, [r0]
-; CHECK-BE-NEXT:    adds r0, #4
+; CHECK-BE-NEXT:    vstrht.16 q1, [r0], #4
 ; CHECK-BE-NEXT:    bx lr
 entry:
   %z = getelementptr inbounds i8, i8* %y, i32 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
index 22483aac109e8..79700e046f0ef 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll
@@ -2609,3 +2609,2614 @@ entry:
   %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
   ret <8 x half> %s
 }
+
+
+; Reversed
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_oeq_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_oeq_v4f32:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s2
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r2, #1
+; CHECK-MVE-NEXT:    cmp r2, #0
+; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r3, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r3, #1
+; CHECK-MVE-NEXT:    cmp r3, #0
+; CHECK-MVE-NEXT:    cset r3, ne
+; CHECK-MVE-NEXT:    movs r0, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it eq
+; CHECK-MVE-NEXT:    moveq r0, #1
+; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    lsls r0, r3, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT:    lsls r0, r2, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT:    lsls r0, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_oeq_v4f32:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov r0, s4
+; CHECK-MVEFP-NEXT:    vcmp.f32 eq, q0, r0
+; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %src2, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %c = fcmp oeq <4 x float> %sp, %src
+  %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_one_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_one_v4f32:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r1, #1
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
+; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r2, #1
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s2
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r2, #1
+; CHECK-MVE-NEXT:    cmp r2, #0
+; CHECK-MVE-NEXT:    mov.w r3, #0
+; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r3, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r3, #1
+; CHECK-MVE-NEXT:    cmp r3, #0
+; CHECK-MVE-NEXT:    mov.w r0, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
+; CHECK-MVE-NEXT:    cset r3, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it mi
+; CHECK-MVE-NEXT:    movmi r0, #1
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r0, #1
+; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    lsls r0, r3, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT:    lsls r0, r2, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT:    lsls r0, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_one_v4f32:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov r0, s4
+; CHECK-MVEFP-NEXT:    vpt.f32 le, q0, r0
+; CHECK-MVEFP-NEXT:    vcmpt.f32 ge, q0, r0
+; CHECK-MVEFP-NEXT:    vpnot
+; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %src2, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %c = fcmp one <4 x float> %sp, %src
+  %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ogt_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ogt_v4f32:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s2
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r2, #1
+; CHECK-MVE-NEXT:    cmp r2, #0
+; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r3, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r3, #1
+; CHECK-MVE-NEXT:    cmp r3, #0
+; CHECK-MVE-NEXT:    cset r3, ne
+; CHECK-MVE-NEXT:    movs r0, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it gt
+; CHECK-MVE-NEXT:    movgt r0, #1
+; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    lsls r0, r3, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT:    lsls r0, r2, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT:    lsls r0, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ogt_v4f32:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov r0, s4
+; CHECK-MVEFP-NEXT:    vcmp.f32 lt, q0, r0
+; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %src2, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %c = fcmp ogt <4 x float> %sp, %src
+  %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_oge_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_oge_v4f32:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s0
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r1, #1
+; CHECK-MVE-NEXT:    cmp r1, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s1
+; CHECK-MVE-NEXT:    cset r1, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r2, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s2
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r2, #1
+; CHECK-MVE-NEXT:    cmp r2, #0
+; CHECK-MVE-NEXT:    cset r2, ne
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    mov.w r3, #0
+; CHECK-MVE-NEXT:    vcmp.f32 s4, s3
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r3, #1
+; CHECK-MVE-NEXT:    cmp r3, #0
+; CHECK-MVE-NEXT:    cset r3, ne
+; CHECK-MVE-NEXT:    movs r0, #0
+; CHECK-MVE-NEXT:    vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT:    it ge
+; CHECK-MVE-NEXT:    movge r0, #1
+; CHECK-MVE-NEXT:    cmp r0, #0
+; CHECK-MVE-NEXT:    cset r0, ne
+; CHECK-MVE-NEXT:    lsls r0, r0, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT:    lsls r0, r3, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT:    lsls r0, r2, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT:    lsls r0, r1, #31
+; CHECK-MVE-NEXT:    vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_oge_v4f32:
+; CHECK-MVEFP:       @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT:    vmov r0, s4
+; CHECK-MVEFP-NEXT:    vcmp.f32 le, q0, r0
+; CHECK-MVEFP-NEXT:    vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT:    bx lr
+entry:
+  %i = insertelement <4 x float> undef, float %src2, i32 0
+  %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+  %c = fcmp oge <4 x float> %sp, %src
+  %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %s
+}
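+
+; A note on these reversed forms (an observation drawn from the CHECK-MVEFP
+; lines in this file, not an authoritative statement of the lowering rules):
+; with the scalar splat on the left-hand side of the fcmp, the lowering keeps
+; the vector operand first and swaps the condition instead, for example:
+;
+;   %c = fcmp oge <4 x float> %sp, %src   ->   vcmp.f32 le, q0, r0
+;   %c = fcmp ogt <4 x float> %sp, %src   ->   vcmp.f32 lt, q0, r0
+;
+; while the symmetric oeq form lowers to a plain vcmp.f32 eq, q0, r0.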
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_olt_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_olt_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_olt_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 gt, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp olt <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ole_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ole_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ole_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp ole <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ueq_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ueq_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r1, #1
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r2, #1
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r3, #1
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ueq_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vpt.f32 le, q0, r0
+; CHECK-MVEFP-NEXT: vcmpt.f32 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp ueq <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_une_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_une_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_une_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 ne, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp une <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ugt_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ugt_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ugt_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp ugt <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_uge_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_uge_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_uge_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 gt, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp uge <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ult_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ult_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ult_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 le, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp ult <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ule_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ule_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it le
+; CHECK-MVE-NEXT: movle r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it le
+; CHECK-MVE-NEXT: movle r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it le
+; CHECK-MVE-NEXT: movle r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it le
+; CHECK-MVE-NEXT: movle r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ule_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vcmp.f32 lt, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp ule <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_ord_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ord_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it vc
+; CHECK-MVE-NEXT: movvc r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it vc
+; CHECK-MVE-NEXT: movvc r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it vc
+; CHECK-MVE-NEXT: movvc r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it vc
+; CHECK-MVE-NEXT: movvc r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ord_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vpt.f32 le, q0, r0
+; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp ord <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+define arm_aapcs_vfpcc <4 x float> @vcmp_r_uno_v4f32(<4 x float> %src, float %src2, <4 x float> %a, <4 x float> %b) {
+; CHECK-MVE-LABEL: vcmp_r_uno_v4f32:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: vcmp.f32 s4, s0
+; CHECK-MVE-NEXT: movs r1, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s1
+; CHECK-MVE-NEXT: cset r1, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s2
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r3, #0
+; CHECK-MVE-NEXT: vcmp.f32 s4, s3
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r3, #1
+; CHECK-MVE-NEXT: cmp r3, #0
+; CHECK-MVE-NEXT: cset r3, ne
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f32 s3, s15, s11
+; CHECK-MVE-NEXT: lsls r0, r3, #31
+; CHECK-MVE-NEXT: vseleq.f32 s2, s14, s10
+; CHECK-MVE-NEXT: lsls r0, r2, #31
+; CHECK-MVE-NEXT: vseleq.f32 s1, s13, s9
+; CHECK-MVE-NEXT: lsls r0, r1, #31
+; CHECK-MVE-NEXT: vseleq.f32 s0, s12, s8
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_uno_v4f32:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vmov r0, s4
+; CHECK-MVEFP-NEXT: vpt.f32 le, q0, r0
+; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %i = insertelement <4 x float> undef, float %src2, i32 0
+ %sp = shufflevector <4 x float> %i, <4 x float> undef, <4 x i32> zeroinitializer
+ %c = fcmp uno <4 x float> %sp, %src
+ %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %s
+}
+
+
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp oeq <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_one_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r2, #1
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0
+; CHECK-MVEFP-NEXT: vcmpt.f16 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp one <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it gt
+; CHECK-MVE-NEXT: movgt r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp ogt <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_oge_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ge
+; CHECK-MVE-NEXT: movge r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp oge <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_olt_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it mi
+; CHECK-MVE-NEXT: movmi r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp olt <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ole_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ls
+; CHECK-MVE-NEXT: movls r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp ole <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: mov.w r2, #0
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r2, #1
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r0, #1
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it eq
+; CHECK-MVE-NEXT: moveq r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: it vs
+; CHECK-MVE-NEXT: movvs r1, #1
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0
+; CHECK-MVEFP-NEXT: vcmpt.f16 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp ueq <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_une_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it ne
+; CHECK-MVE-NEXT: movne r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 ne, q0, r0
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp une <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ugt_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it hi
+; CHECK-MVE-NEXT: movhi r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp ugt <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_uge_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s6
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s3
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: vmov.16 q3[5], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s0, s7
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s2, s11
+; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it pl
+; CHECK-MVE-NEXT: movpl r1, #1
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: cmp r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[6], r0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0
+; CHECK-MVE-NEXT: vmov r0, s0
+; CHECK-MVE-NEXT: vmov.16 q3[7], r0
+; CHECK-MVE-NEXT: vmov q0, q3
+; CHECK-MVE-NEXT: vpop {d8, d9, d10}
+; CHECK-MVE-NEXT: bx lr
+;
+; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16:
+; CHECK-MVEFP: @ %bb.0: @ %entry
+; CHECK-MVEFP-NEXT: vldr.16 s12, [r0]
+; CHECK-MVEFP-NEXT: vmov r0, s12
+; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, r0
+; CHECK-MVEFP-NEXT: vpnot
+; CHECK-MVEFP-NEXT: vpsel q0, q1, q2
+; CHECK-MVEFP-NEXT: bx lr
+entry:
+ %src2 = load half, half* %src2p
+ %i = insertelement <8 x half> undef, half %src2, i32 0
+ %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer
+ %c = fcmp uge <8 x half> %sp, %src
+ %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b
+ ret <8 x half> %s
+}
+
+define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) {
+; CHECK-MVE-LABEL: vcmp_r_ult_v8f16:
+; CHECK-MVE: @ %bb.0: @ %entry
+; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
+; CHECK-MVE-NEXT: vpush {d8, d9, d10}
+; CHECK-MVE-NEXT: vldr.16 s16, [r0]
+; CHECK-MVE-NEXT: vmovx.f16 s12, s0
+; CHECK-MVE-NEXT: movs r0, #0
+; CHECK-MVE-NEXT: vmovx.f16 s14, s8
+; CHECK-MVE-NEXT: vcmp.f16 s16, s12
+; CHECK-MVE-NEXT: vmovx.f16 s12, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: vcmp.f16 s16, s0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: movs r2, #0
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vmovx.f16 s20, s9
+; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r2, #1
+; CHECK-MVE-NEXT: cmp r2, #0
+; CHECK-MVE-NEXT: cset r2, ne
+; CHECK-MVE-NEXT: vmov r0, s12
+; CHECK-MVE-NEXT: lsls r2, r2, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s1
+; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r2, s12
+; CHECK-MVE-NEXT: vmovx.f16 s0, s3
+; CHECK-MVE-NEXT: vmov.16 q3[0], r2
+; CHECK-MVE-NEXT: mov.w r1, #0
+; CHECK-MVE-NEXT: vmov.16 q3[1], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s1
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[2], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: vmovx.f16 s18, s5
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vcmp.f16 s16, s2
+; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s20, s10
+; CHECK-MVE-NEXT: vmov.16 q3[3], r0
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r0, #1
+; CHECK-MVE-NEXT: cmp r0, #0
+; CHECK-MVE-NEXT: cset r0, ne
+; CHECK-MVE-NEXT: lsls r0, r0, #31
+; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6
+; CHECK-MVE-NEXT: vmov r0, s18
+; CHECK-MVE-NEXT: vmovx.f16 s18, s2
+; CHECK-MVE-NEXT: vcmp.f16 s16, s18
+; CHECK-MVE-NEXT: vmov.16 q3[4], r0
+; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr
+; CHECK-MVE-NEXT: mov.w r0, #0
+; CHECK-MVE-NEXT: it lt
+; CHECK-MVE-NEXT: movlt r0,
#1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s6 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s3 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[6], r0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9, d10} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, r0 +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %src2 = load half, half* %src2p + %i = insertelement <8 x half> undef, half %src2, i32 0 + %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer + %c = fcmp ult <8 x half> %sp, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ule_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9, d10} +; CHECK-MVE-NEXT: vpush {d8, d9, d10} +; CHECK-MVE-NEXT: vldr.16 s16, [r0] +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: vcmp.f16 s16, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s1 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[1], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s18, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, s18 +; CHECK-MVE-NEXT: vmov.16 q3[2], r0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, 
#1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s2 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s18, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, s18 +; CHECK-MVE-NEXT: vmov.16 q3[4], r0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s6 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s3 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[6], r0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9, d10} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, r0 +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %src2 = load half, half* %src2p + %i = insertelement <8 x half> undef, half %src2, i32 0 + %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer + %c = fcmp ule <8 x half> %sp, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half* %src2p, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ord_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9, d10} +; CHECK-MVE-NEXT: vpush {d8, d9, d10} +; CHECK-MVE-NEXT: vldr.16 s16, [r0] +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: vcmp.f16 s16, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset 
r2, ne +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s1 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[1], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s18, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, s18 +; CHECK-MVE-NEXT: vmov.16 q3[2], r0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s2 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s18, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, s18 +; CHECK-MVE-NEXT: vmov.16 q3[4], r0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s6 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s3 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[6], r0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9, d10} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0 +; CHECK-MVEFP-NEXT: vcmpt.f16 gt, q0, r0 +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %src2 = load half, half* %src2p + %i = insertelement <8 x half> undef, half %src2, i32 0 + %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer + %c = fcmp ord <8 x half> %sp, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, 
half* %src2p, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_uno_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9, d10} +; CHECK-MVE-NEXT: vpush {d8, d9, d10} +; CHECK-MVE-NEXT: vldr.16 s16, [r0] +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: vcmp.f16 s16, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: movs r2, #0 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s20, s9 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r0, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s1 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[1], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s18, s9, s5 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s18, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, s18 +; CHECK-MVE-NEXT: vmov.16 q3[2], r0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s5 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s2 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s20, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s18, s10, s6 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vmovx.f16 s18, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, s18 +; CHECK-MVE-NEXT: vmov.16 q3[4], r0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s18, s6 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vcmp.f16 s16, s3 +; CHECK-MVE-NEXT: vseleq.f16 s18, s20, s18 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: vcmp.f16 s16, s0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s18, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: vmov r0, s18 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vmov.16 q3[6], r0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls 
r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9, d10} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] +; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0 +; CHECK-MVEFP-NEXT: vcmpt.f16 gt, q0, r0 +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %src2 = load half, half* %src2p + %i = insertelement <8 x half> undef, half %src2, i32 0 + %sp = shufflevector <8 x half> %i, <8 x half> undef, <8 x i32> zeroinitializer + %c = fcmp uno <8 x half> %sp, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll index 6aae7e7665a10..82ef5df349aaf 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfz.ll @@ -107,9 +107,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_one_v4f32(<4 x float> %src, <4 x float> ; ; CHECK-MVEFP-LABEL: vcmp_one_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f32 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f32 le, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 le, q0, zr ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -380,9 +379,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ueq_v4f32(<4 x float> %src, <4 x float> ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f32 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f32 le, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 le, q0, zr ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr entry: @@ -698,9 +696,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ord_v4f32(<4 x float> %src, <4 x float> ; ; CHECK-MVEFP-LABEL: vcmp_ord_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f32 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, zr ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -753,9 +750,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_uno_v4f32(<4 x float> %src, <4 x float> ; ; CHECK-MVEFP-LABEL: vcmp_uno_v4f32: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f32 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, zr ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr entry: @@ -1013,9 +1009,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, <8 x half> %a ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f16 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f16 le, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 le, q0, zr ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -1632,9 +1627,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, <8 x half> %a ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; 
CHECK-MVEFP-NEXT: vpt.f16 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f16 le, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 le, q0, zr ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr entry: @@ -2358,9 +2352,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, <8 x half> %a ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f16 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f16 lt, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, zr ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -2481,9 +2474,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, <8 x half> %a ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vmov.i32 q3, #0x0 -; CHECK-MVEFP-NEXT: vpt.f16 le, q3, q0 -; CHECK-MVEFP-NEXT: vcmpt.f16 lt, q0, q3 +; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 lt, q0, zr ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr entry: @@ -2491,3 +2483,2488 @@ entry: %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b ret <8 x half> %s } + + +; Reversed + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_oeq_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_oeq_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_oeq_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 eq, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp oeq <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_one_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_one_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; 
CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r2, #1 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r3, #1 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r0, #1 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_one_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f32 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp one <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ogt_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_ogt_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ogt_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 lt, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ogt <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_oge_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_oge_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; 
CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_oge_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 le, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp oge <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_olt_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_olt_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_olt_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 gt, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp olt <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ole_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_ole_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ge +; 
CHECK-MVE-NEXT: movge r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ole_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ole <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ueq_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_ueq_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r2, #1 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r3, #1 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r0, #1 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ueq_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f32 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ueq <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> 
@vcmp_r_une_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_une_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_une_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 ne, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp une <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ugt_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_ugt_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ugt_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ugt <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_uge_v4f32(<4 x float> %src, <4 x float> %a, 
<4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_uge_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_uge_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 gt, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp uge <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ult_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_ult_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ult_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 le, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ult <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ule_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; 
CHECK-MVE-LABEL: vcmp_r_ule_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, #0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ule_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f32 lt, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ule <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_ord_v4f32(<4 x float> %src, <4 x float> %a, <4 x float> %b) { +; CHECK-MVE-LABEL: vcmp_r_ord_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, s1 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, s2 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, s3 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ord_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f32 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ord <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + +define arm_aapcs_vfpcc <4 x float> @vcmp_r_uno_v4f32(<4 x float> %src, <4 x float> %a, <4 x 
float> %b) { +; CHECK-MVE-LABEL: vcmp_r_uno_v4f32: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: vcmp.f32 s0, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f32 s1, s1 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vcmp.f32 s2, s2 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r3, #0 +; CHECK-MVE-NEXT: vcmp.f32 s3, s3 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r3, #1 +; CHECK-MVE-NEXT: cmp r3, #0 +; CHECK-MVE-NEXT: cset r3, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f32 s3, s11, s7 +; CHECK-MVE-NEXT: lsls r0, r3, #31 +; CHECK-MVE-NEXT: vseleq.f32 s2, s10, s6 +; CHECK-MVE-NEXT: lsls r0, r2, #31 +; CHECK-MVE-NEXT: vseleq.f32 s1, s9, s5 +; CHECK-MVE-NEXT: lsls r0, r1, #31 +; CHECK-MVE-NEXT: vseleq.f32 s0, s8, s4 +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_uno_v4f32: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f32 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp uno <4 x float> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x float> %a, <4 x float> %b + ret <4 x float> %s +} + + + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_oeq_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: 
vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp oeq <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_one_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r2, #1 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; 
CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r0, #1 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f16 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp one <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ogt_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, 
s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it mi +; CHECK-MVE-NEXT: movmi r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16: +; CHECK-MVEFP: @ 
%bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ogt <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_oge_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, 
s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ls +; CHECK-MVE-NEXT: movls r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp oge <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_olt_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls 
r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it gt +; CHECK-MVE-NEXT: movgt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp olt <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ole_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r1, #1 +; 
CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ge +; CHECK-MVE-NEXT: movge r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ole <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ueq_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r2, #1 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: mov.w r0, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: 
vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r1, #1 +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it eq +; CHECK-MVE-NEXT: moveq r0, #1 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f16 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 ge, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ueq <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_une_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: 
mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it ne +; CHECK-MVE-NEXT: movne r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 ne, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp une <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: 
vcmp_r_ugt_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it lt +; CHECK-MVE-NEXT: movlt r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 
+; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ugt <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_uge_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_uge_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r1, #1 +; CHECK-MVE-NEXT: cmp r1, 
#0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it le +; CHECK-MVE-NEXT: movle r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp uge <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ult_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; 
CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it hi +; CHECK-MVE-NEXT: movhi r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ult <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ule_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, #0 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; 
CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, #0 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, #0 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, #0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it pl +; CHECK-MVE-NEXT: movpl r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ule <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_ord_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, s0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r2, #1 +; CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; 
CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, s16 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, s2 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, s16 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, s3 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vc +; CHECK-MVE-NEXT: movvc r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f16 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, zr +; CHECK-MVEFP-NEXT: vpnot +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp ord <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} + +define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, <8 x half> %a, <8 x half> %b) { +; CHECK-MVE-LABEL: vcmp_r_uno_v8f16: +; CHECK-MVE: @ %bb.0: @ %entry +; CHECK-MVE-NEXT: .vsave {d8, d9} +; CHECK-MVE-NEXT: vpush {d8, d9} +; CHECK-MVE-NEXT: vmovx.f16 s12, s0 +; CHECK-MVE-NEXT: movs r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s12, s12 +; CHECK-MVE-NEXT: vmovx.f16 s12, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: vcmp.f16 s0, s0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s14, s8 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: mov.w r2, #0 +; CHECK-MVE-NEXT: vseleq.f16 s12, s14, s12 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r2, #1 +; 
CHECK-MVE-NEXT: cmp r2, #0 +; CHECK-MVE-NEXT: cset r2, ne +; CHECK-MVE-NEXT: vmov r1, s12 +; CHECK-MVE-NEXT: lsls r2, r2, #31 +; CHECK-MVE-NEXT: vcmp.f16 s1, s1 +; CHECK-MVE-NEXT: vseleq.f16 s12, s8, s4 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r2, s12 +; CHECK-MVE-NEXT: vmovx.f16 s18, s9 +; CHECK-MVE-NEXT: vmov.16 q3[0], r2 +; CHECK-MVE-NEXT: vmovx.f16 s0, s3 +; CHECK-MVE-NEXT: vmov.16 q3[1], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: movs r0, #0 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s9, s5 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s1 +; CHECK-MVE-NEXT: vcmp.f16 s16, s16 +; CHECK-MVE-NEXT: vmov.16 q3[2], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s5 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s2, s2 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s18, s10 +; CHECK-MVE-NEXT: vmov.16 q3[3], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vseleq.f16 s16, s10, s6 +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vmovx.f16 s16, s2 +; CHECK-MVE-NEXT: vcmp.f16 s16, s16 +; CHECK-MVE-NEXT: vmov.16 q3[4], r1 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s16, s6 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vcmp.f16 s3, s3 +; CHECK-MVE-NEXT: vseleq.f16 s16, s18, s16 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: vcmp.f16 s0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[5], r1 +; CHECK-MVE-NEXT: mov.w r1, #0 +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r1, #1 +; CHECK-MVE-NEXT: cmp r1, #0 +; CHECK-MVE-NEXT: cset r1, ne +; CHECK-MVE-NEXT: vmovx.f16 s0, s7 +; CHECK-MVE-NEXT: lsls r1, r1, #31 +; CHECK-MVE-NEXT: vmovx.f16 s2, s11 +; CHECK-MVE-NEXT: vseleq.f16 s16, s11, s7 +; CHECK-MVE-NEXT: vmrs APSR_nzcv, fpscr +; CHECK-MVE-NEXT: it vs +; CHECK-MVE-NEXT: movvs r0, #1 +; CHECK-MVE-NEXT: cmp r0, #0 +; CHECK-MVE-NEXT: cset r0, ne +; CHECK-MVE-NEXT: vmov r1, s16 +; CHECK-MVE-NEXT: lsls r0, r0, #31 +; CHECK-MVE-NEXT: vmov.16 q3[6], r1 +; CHECK-MVE-NEXT: vseleq.f16 s0, s2, s0 +; CHECK-MVE-NEXT: vmov r0, s0 +; CHECK-MVE-NEXT: vmov.16 q3[7], r0 +; CHECK-MVE-NEXT: vmov q0, q3 +; CHECK-MVE-NEXT: vpop {d8, d9} +; CHECK-MVE-NEXT: bx lr +; +; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: +; CHECK-MVEFP: @ %bb.0: @ %entry +; CHECK-MVEFP-NEXT: vpt.f16 le, q0, zr +; CHECK-MVEFP-NEXT: vcmpt.f32 gt, q0, zr +; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 +; CHECK-MVEFP-NEXT: bx lr +entry: + %c = fcmp uno <8 x half> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x half> %a, <8 x half> %b + ret <8 x half> %s +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll index a6fc2dbe4a4c8..c832c241e67cc 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpr.ll @@ -591,3 +591,596 @@ define 
arm_aapcs_vfpcc <2 x i32> @vcmp_multi_v2i32(<2 x i64> %a, <2 x i32> %b, < %a11 = select <2 x i1> %a10, <2 x i32> %c, <2 x i32> %a5 ret <2 x i32> %a11 } + +; Reversed + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_eq_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_eq_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i32 eq, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp eq <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ne_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ne_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i32 ne, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp ne <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_sgt_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_sgt_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 lt, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp sgt <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_sge_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_sge_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 le, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp sge <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_slt_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_slt_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 gt, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp slt <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_sle_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_sle_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 ge, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp sle <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ugt_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ugt_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.32 q3, r0 +; CHECK-NEXT: vcmp.u32 hi, q3, q0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i 
= insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp ugt <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_uge_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_uge_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.32 q3, r0 +; CHECK-NEXT: vcmp.u32 cs, q3, q0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp uge <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ult_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ult_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u32 hi, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp ult <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ule_v4i32(<4 x i32> %src, i32 %src2, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ule_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u32 cs, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <4 x i32> undef, i32 %src2, i32 0 + %sp = shufflevector <4 x i32> %i, <4 x i32> undef, <4 x i32> zeroinitializer + %c = icmp ule <4 x i32> %sp, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_eq_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_eq_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i16 eq, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp eq <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ne_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ne_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i16 ne, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp ne <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_sgt_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_sgt_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 lt, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp sgt <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_sge_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_sge_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 
le, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp sge <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_slt_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_slt_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 gt, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp slt <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_sle_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_sle_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 ge, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp sle <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ugt_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ugt_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.16 q3, r0 +; CHECK-NEXT: vcmp.u16 hi, q3, q0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp ugt <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_uge_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_uge_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.16 q3, r0 +; CHECK-NEXT: vcmp.u16 cs, q3, q0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp uge <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ult_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ult_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u16 hi, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp ult <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ule_v8i16(<8 x i16> %src, i16 %src2, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ule_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u16 cs, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <8 x i16> undef, i16 %src2, i32 0 + %sp = shufflevector <8 x i16> %i, <8 x i16> undef, <8 x i32> zeroinitializer + %c = icmp ule <8 x i16> %sp, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_eq_v16i8(<16 x i8> %src, i8 %src2, <16 x 
i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_eq_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i8 eq, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp eq <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ne_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ne_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i8 ne, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp ne <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_sgt_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_sgt_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 lt, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp sgt <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_sge_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_sge_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 le, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp sge <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_slt_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_slt_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 gt, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp slt <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_sle_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_sle_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 ge, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp sle <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ugt_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ugt_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.8 q3, r0 +; CHECK-NEXT: vcmp.u8 hi, q3, q0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp ugt <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define 
arm_aapcs_vfpcc <16 x i8> @vcmp_r_uge_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_uge_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vdup.8 q3, r0 +; CHECK-NEXT: vcmp.u8 cs, q3, q0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp uge <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ult_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ult_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u8 hi, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp ult <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ule_v16i8(<16 x i8> %src, i8 %src2, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ule_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u8 cs, q0, r0 +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <16 x i8> undef, i8 %src2, i32 0 + %sp = shufflevector <16 x i8> %i, <16 x i8> undef, <16 x i32> zeroinitializer + %c = icmp ule <16 x i8> %sp, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + + +define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eq_v2i64(<2 x i64> %src, i64 %src2, <2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vcmp_r_eq_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: eors r2, r1 +; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <2 x i64> undef, i64 %src2, i32 0 + %sp = shufflevector <2 x i64> %i, <2 x i64> undef, <2 x i32> zeroinitializer + %c = icmp eq <2 x i64> %sp, %src + %s = select <2 x i1> %c, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eq_v2i32(<2 x i64> %src, i64 %src2, <2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vcmp_r_eq_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: eors r2, r1 +; CHECK-NEXT: eors r3, r0 +; CHECK-NEXT: orrs r2, r3 +; CHECK-NEXT: cset r2, eq +; CHECK-NEXT: tst.w r2, #1 +; CHECK-NEXT: csetm r2, ne +; CHECK-NEXT: vmov.32 q3[0], r2 +; CHECK-NEXT: vmov.32 q3[1], r2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: eors r1, r2 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: eors r0, r2 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %i = insertelement <2 
x i64> undef, i64 %src2, i32 0 + %sp = shufflevector <2 x i64> %i, <2 x i64> undef, <2 x i32> zeroinitializer + %c = icmp eq <2 x i64> %sp, %src + %s = select <2 x i1> %c, <2 x i32> %a, <2 x i32> %b + ret <2 x i32> %s +} + +define arm_aapcs_vfpcc <2 x i32> @vcmp_r_multi_v2i32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) { +; CHECK-LABEL: vcmp_r_multi_v2i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vmov lr, s0 +; CHECK-NEXT: subs.w r1, lr, r2 +; CHECK-NEXT: asr.w r12, lr, #31 +; CHECK-NEXT: sbcs.w r1, r12, r2, asr #31 +; CHECK-NEXT: mov.w r1, #0 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r1, #1 +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: csetm r1, ne +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: subs r0, r1, r2 +; CHECK-NEXT: asr.w r12, r1, #31 +; CHECK-NEXT: sbcs.w r0, r12, r2, asr #31 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cmp.w lr, #0 +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: cmp r1, #0 +; CHECK-NEXT: vmov.32 q4[0], r0 +; CHECK-NEXT: vmov.32 q4[1], r0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q4[2], r0 +; CHECK-NEXT: vmov.32 q4[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q5[0], r0 +; CHECK-NEXT: vmov.32 q5[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: cset r0, ne +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q5[2], r0 +; CHECK-NEXT: vmov.32 q5[3], r0 +; CHECK-NEXT: vand q1, q5, q4 +; CHECK-NEXT: vand q1, q3, q1 +; CHECK-NEXT: vbic q0, q0, q1 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r7, pc} + %a4 = icmp eq <2 x i64> %a, zeroinitializer + %a5 = select <2 x i1> %a4, <2 x i32> zeroinitializer, <2 x i32> %c + %a6 = icmp ne <2 x i32> %b, zeroinitializer + %a7 = icmp slt <2 x i32> %a5, %c + %a8 = icmp ne <2 x i32> %a5, zeroinitializer + %a9 = and <2 x i1> %a6, %a8 + %a10 = and <2 x i1> %a7, %a9 + %a11 = select <2 x i1> %a10, <2 x i32> %c, <2 x i32> %a5 + ret <2 x i32> %a11 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll index 142511b10d6af..6d08abc723021 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpz.ll @@ -415,3 +415,421 @@ entry: %s = select <2 x i1> %c, <2 x i32> %a, <2 x i32> %b ret <2 x i32> %s } + + +; Reversed + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_eqz_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: 
vcmp_r_eqz_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i32 eq, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_nez_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_nez_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ne <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_sgtz_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_sgtz_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 lt, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_sgez_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_sgez_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 le, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sge <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_sltz_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_sltz_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 gt, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp slt <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_slez_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_slez_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s32 ge, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sle <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ugtz_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ugtz_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ugt <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ugez_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ugez_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u32 cs, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp uge <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ultz_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ultz_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i32 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult <4 x i32> zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + +define arm_aapcs_vfpcc <4 x i32> @vcmp_r_ulez_v4i32(<4 x i32> %src, <4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vcmp_r_ulez_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ule <4 x i32> 
zeroinitializer, %src + %s = select <4 x i1> %c, <4 x i32> %a, <4 x i32> %b + ret <4 x i32> %s +} + + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_eqz_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_eqz_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i16 eq, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_nez_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_nez_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ne <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_sgtz_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_sgtz_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 lt, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_sgez_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_sgez_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 le, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sge <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_sltz_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_sltz_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 gt, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp slt <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_slez_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_slez_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s16 ge, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sle <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ugtz_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ugtz_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ugt <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ugez_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ugez_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u16 cs, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp uge <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> @vcmp_r_ultz_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ultz_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + +define arm_aapcs_vfpcc <8 x i16> 
@vcmp_r_ulez_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vcmp_r_ulez_v8i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ule <8 x i16> zeroinitializer, %src + %s = select <8 x i1> %c, <8 x i16> %a, <8 x i16> %b + ret <8 x i16> %s +} + + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_eqz_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_eqz_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i8 eq, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_nez_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_nez_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i8 ne, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ne <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_sgtz_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_sgtz_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 lt, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sgt <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_sgez_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_sgez_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 le, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sge <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_sltz_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_sltz_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 gt, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp slt <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_slez_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_slez_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.s8 ge, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp sle <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ugtz_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ugtz_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ugt <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ugez_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ugez_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.u8 cs, q0, zr +; CHECK-NEXT: vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp uge <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ultz_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ultz_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcmp.i8 ne, q0, zr +; CHECK-NEXT: 
vpsel q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %c = icmp ult <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + +define arm_aapcs_vfpcc <16 x i8> @vcmp_r_ulez_v16i8(<16 x i8> %src, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: vcmp_r_ulez_v16i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %c = icmp ule <16 x i8> zeroinitializer, %src + %s = select <16 x i1> %c, <16 x i8> %a, <16 x i8> %b + ret <16 x i8> %s +} + + +define arm_aapcs_vfpcc <2 x i64> @vcmp_r_eqz_v2i64(<2 x i64> %src, <2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: vcmp_r_eqz_v2i64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <2 x i64> zeroinitializer, %src + %s = select <2 x i1> %c, <2 x i64> %a, <2 x i64> %b + ret <2 x i64> %s +} + +define arm_aapcs_vfpcc <2 x i32> @vcmp_r_eqz_v2i32(<2 x i64> %src, <2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: vcmp_r_eqz_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[0], r0 +; CHECK-NEXT: vmov.32 q3[1], r0 +; CHECK-NEXT: vmov r0, s3 +; CHECK-NEXT: orrs r0, r1 +; CHECK-NEXT: cset r0, eq +; CHECK-NEXT: tst.w r0, #1 +; CHECK-NEXT: csetm r0, ne +; CHECK-NEXT: vmov.32 q3[2], r0 +; CHECK-NEXT: vmov.32 q3[3], r0 +; CHECK-NEXT: vbic q0, q2, q3 +; CHECK-NEXT: vand q1, q1, q3 +; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: bx lr +entry: + %c = icmp eq <2 x i64> %src, zeroinitializer + %s = select <2 x i1> %c, <2 x i32> %a, <2 x i32> %b + ret <2 x i32> %s +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vctp.ll b/llvm/test/CodeGen/Thumb2/mve-vctp.ll index 8f7e1696e6790..d6e4d492f5351 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vctp.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vctp.ll @@ -10,7 +10,7 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) { ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r2] ; CHECK-NEXT: bx lr - %pred = call <16 x i1> @llvm.arm.vctp8(i32 %arg) + %pred = call <16 x i1> @llvm.arm.mve.vctp8(i32 %arg) %ld = load <16 x i8>, <16 x i8>* %in %res = select <16 x i1> %pred, <16 x i8> %ld, <16 x i8> zeroinitializer store <16 x i8> %res, <16 x i8>* %out @@ -26,7 +26,7 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) { ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r2] ; CHECK-NEXT: bx lr - %pred = call <8 x i1> @llvm.arm.vctp16(i32 %arg) + %pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %arg) %ld = load <8 x i16>, <8 x i16>* %in %res = select <8 x i1> %pred, <8 x i16> %ld, <8 x i16> zeroinitializer store <8 x i16> %res, <8 x i16>* %out @@ -42,13 +42,13 @@ define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) { ; CHECK-NEXT: vpsel q0, q1, q0 ; CHECK-NEXT: vstrw.32 q0, [r2] ; CHECK-NEXT: bx lr - %pred = call <4 x i1> @llvm.arm.vctp32(i32 %arg) + %pred = call <4 x i1> 
@llvm.arm.mve.vctp32(i32 %arg) %ld = load <4 x i32>, <4 x i32>* %in %res = select <4 x i1> %pred, <4 x i32> %ld, <4 x i32> zeroinitializer store <4 x i32> %res, <4 x i32>* %out ret void } -declare <16 x i1> @llvm.arm.vctp8(i32) -declare <8 x i1> @llvm.arm.vctp16(i32) -declare <4 x i1> @llvm.arm.vctp32(i32) +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) diff --git a/llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll b/llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll index 19979f203f16b..83534e2c3e833 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vhaddsub.ll @@ -4,7 +4,8 @@ define arm_aapcs_vfpcc <16 x i8> @add_ashr_v16i8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: add_ashr_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.s8 q0, q0, q1 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vshr.s8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <16 x i8> %src1, %src2 @@ -15,7 +16,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @add_ashr_v8i16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: add_ashr_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.s16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <8 x i16> %src1, %src2 @@ -26,7 +28,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @add_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: add_ashr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.s32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.s32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add nsw <4 x i32> %src1, %src2 @@ -37,7 +40,8 @@ entry: define arm_aapcs_vfpcc <16 x i8> @add_lshr_v16i8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: add_lshr_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.u8 q0, q0, q1 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vshr.u8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <16 x i8> %src1, %src2 @@ -48,7 +52,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @add_lshr_v8i16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: add_lshr_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.u16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <8 x i16> %src1, %src2 @@ -59,7 +64,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @add_lshr_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: add_lshr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.u32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add nsw <4 x i32> %src1, %src2 @@ -70,7 +76,8 @@ entry: define arm_aapcs_vfpcc <16 x i8> @sub_ashr_v16i8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: sub_ashr_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.s8 q0, q0, q1 +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vshr.s8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <16 x i8> %src1, %src2 @@ -81,7 +88,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @sub_ashr_v8i16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: sub_ashr_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.s16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <8 x i16> %src1, %src2 @@ -92,7 +100,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @sub_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: sub_ashr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.s32 q0, q0, q1 +; CHECK-NEXT: 
vsub.i32 q0, q0, q1 +; CHECK-NEXT: vshr.s32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <4 x i32> %src1, %src2 @@ -103,7 +112,8 @@ entry: define arm_aapcs_vfpcc <16 x i8> @sub_lshr_v16i8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: sub_lshr_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.u8 q0, q0, q1 +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vshr.u8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <16 x i8> %src1, %src2 @@ -114,7 +124,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @sub_lshr_v8i16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: sub_lshr_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.u16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <8 x i16> %src1, %src2 @@ -125,7 +136,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @sub_lshr_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: sub_lshr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.u32 q0, q0, q1 +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <4 x i32> %src1, %src2 @@ -140,7 +152,8 @@ define arm_aapcs_vfpcc <16 x i8> @add_sdiv_v16i8(<16 x i8> %src1, <16 x i8> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.i8 q0, q0, q1 ; CHECK-NEXT: vshr.u8 q1, q0, #7 -; CHECK-NEXT: vhadd.s8 q0, q0, q1 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vshr.s8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <16 x i8> %src1, %src2 @@ -153,7 +166,8 @@ define arm_aapcs_vfpcc <8 x i16> @add_sdiv_v8i16(<8 x i16> %src1, <8 x i16> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vshr.u16 q1, q0, #15 -; CHECK-NEXT: vhadd.s16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <8 x i16> %src1, %src2 @@ -166,7 +180,8 @@ define arm_aapcs_vfpcc <4 x i32> @add_sdiv_v4i32(<4 x i32> %src1, <4 x i32> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vadd.i32 q0, q0, q1 ; CHECK-NEXT: vshr.u32 q1, q0, #31 -; CHECK-NEXT: vhadd.s32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.s32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add nsw <4 x i32> %src1, %src2 @@ -177,7 +192,8 @@ entry: define arm_aapcs_vfpcc <16 x i8> @add_udiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: add_udiv_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.u8 q0, q0, q1 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vshr.u8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <16 x i8> %src1, %src2 @@ -188,7 +204,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @add_udiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: add_udiv_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.u16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add <8 x i16> %src1, %src2 @@ -199,7 +216,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @add_udiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: add_udiv_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhadd.u32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = add nsw <4 x i32> %src1, %src2 @@ -212,7 +230,8 @@ define arm_aapcs_vfpcc <16 x i8> @sub_sdiv_v16i8(<16 x i8> %src1, <16 x i8> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vsub.i8 q0, q0, q1 ; CHECK-NEXT: vshr.u8 q1, q0, #7 -; CHECK-NEXT: vhadd.s8 q0, q0, q1 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vshr.s8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <16 x 
i8> %src1, %src2 @@ -225,7 +244,8 @@ define arm_aapcs_vfpcc <8 x i16> @sub_sdiv_v8i16(<8 x i16> %src1, <8 x i16> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vsub.i16 q0, q0, q1 ; CHECK-NEXT: vshr.u16 q1, q0, #15 -; CHECK-NEXT: vhadd.s16 q0, q0, q1 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <8 x i16> %src1, %src2 @@ -238,7 +258,8 @@ define arm_aapcs_vfpcc <4 x i32> @sub_sdiv_v4i32(<4 x i32> %src1, <4 x i32> %src ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vsub.i32 q0, q0, q1 ; CHECK-NEXT: vshr.u32 q1, q0, #31 -; CHECK-NEXT: vhadd.s32 q0, q0, q1 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vshr.s32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <4 x i32> %src1, %src2 @@ -249,7 +270,8 @@ entry: define arm_aapcs_vfpcc <16 x i8> @sub_udiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) { ; CHECK-LABEL: sub_udiv_v16i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.u8 q0, q0, q1 +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vshr.u8 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <16 x i8> %src1, %src2 @@ -260,7 +282,8 @@ entry: define arm_aapcs_vfpcc <8 x i16> @sub_udiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: sub_udiv_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.u16 q0, q0, q1 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vshr.u16 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub <8 x i16> %src1, %src2 @@ -271,7 +294,8 @@ entry: define arm_aapcs_vfpcc <4 x i32> @sub_udiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) { ; CHECK-LABEL: sub_udiv_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vhsub.u32 q0, q0, q1 +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vshr.u32 q0, q0, #1 ; CHECK-NEXT: bx lr entry: %0 = sub nsw <4 x i32> %src1, %src2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll index c7533503fa777..e6e7de6109431 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vpt-from-intrinsics.ll @@ -1,5 +1,7 @@ ; RUN: opt -instcombine %s | llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve --verify-machineinstrs -o - | FileCheck %s +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + define arm_aapcs_vfpcc <8 x i16> @test_vpt_block(<8 x i16> %v_inactive, <8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { ; CHECK-LABEL: test_vpt_block: ; CHECK: @ %bb.0: @ %entry @@ -16,7 +18,27 @@ entry: ret <8 x i16> %5 } +define arm_aapcs_vfpcc <8 x i16> @test_vpnot(<8 x i16> %v, <8 x i16> %w, <8 x i16> %x, i32 %n) { +; CHECK-LABEL: test_vpnot: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vctp.16 r0 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddt.i16 q0, q1, q2 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n) + %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0) + %2 = trunc i32 %1 to i16 + %3 = xor i16 %2, -1 + %4 = zext i16 %3 to i32 + %5 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %4) + %6 = call <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16> %w, <8 x i16> %x, <8 x i1> %5, <8 x i16> %v) + ret <8 x i16> %6 +} + declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>) declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) diff --git a/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll index c24845edbddb1..849cf0d4ce4cf 100644 --- 
a/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/ashr-scalar.ll @@ -165,7 +165,7 @@ define i1 @test_ashr_i1_imm1(i32 %arg1) { ; X64-LABEL: test_ashr_i1_imm1: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movb $-1, %cl +; X64-NEXT: movb $1, %cl ; X64-NEXT: shlb $7, %al ; X64-NEXT: sarb $7, %al ; X64-NEXT: andb $1, %cl diff --git a/llvm/test/CodeGen/X86/GlobalISel/legalize-constant.mir b/llvm/test/CodeGen/X86/GlobalISel/legalize-constant.mir index 3b4bec6978f74..b89116e3a6cf7 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/legalize-constant.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/legalize-constant.mir @@ -18,7 +18,7 @@ registers: body: | bb.1 (%ir-block.0): ; X32-LABEL: name: test_constant - ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; X32: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; X32: $eax = COPY [[C]](s32) ; X32: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 8 ; X32: $al = COPY [[C1]](s8) @@ -32,7 +32,7 @@ body: | ; X32: $rax = COPY [[MV]](s64) ; X32: RET 0 ; X64-LABEL: name: test_constant - ; X64: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; X64: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; X64: $eax = COPY [[C]](s32) ; X64: [[C1:%[0-9]+]]:_(s8) = G_CONSTANT i8 8 ; X64: $al = COPY [[C1]](s8) diff --git a/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll index e935c1ca04bbb..5dd53751247f7 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/lshr-scalar.ll @@ -164,7 +164,7 @@ define i1 @test_lshr_i1_imm1(i32 %arg1) { ; X64-LABEL: test_lshr_i1_imm1: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movb $-1, %cl +; X64-NEXT: movb $1, %cl ; X64-NEXT: andb $1, %al ; X64-NEXT: andb $1, %cl ; X64-NEXT: shrb %cl, %al diff --git a/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll b/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll index 49aa99e01c6ce..5ccc0eee59512 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/shl-scalar.ll @@ -162,7 +162,7 @@ define i1 @test_shl_i1_imm1(i32 %arg1) { ; X64-LABEL: test_shl_i1_imm1: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: movb $-1, %cl +; X64-NEXT: movb $1, %cl ; X64-NEXT: andb $1, %cl ; X64-NEXT: shlb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 6f7247388640a..e6b43c07fe056 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -25,25 +25,25 @@ define <8 x double> @sltof864(<8 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: 
vcvtsi2sd %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq @@ -69,12 +69,12 @@ define <4 x double> @slto4f64(<4 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq ; @@ -100,7 +100,7 @@ define <2 x double> @slto2f64(<2 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; NODQ-NEXT: retq ; ; VLDQ-LABEL: slto2f64: @@ -140,7 +140,7 @@ define <2 x float> @sltof2f32(<2 x i64> %a) { ; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; VLNODQ-NEXT: vmovq %xmm0, %rax ; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VLNODQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; VLNODQ-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; VLNODQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; VLNODQ-NEXT: retq ; @@ -1040,13 +1040,13 @@ define <16 x float> @slto16f32(<16 x i64> %a) { ; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] @@ -1094,25 +1094,25 @@ define <8 x double> @slto8f64(<8 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, 
%xmm4, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ; NODQ-NEXT: retq @@ -1138,25 +1138,25 @@ define <16 x double> @slto16f64(<16 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm0, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 @@ -1164,25 +1164,25 @@ define <16 x double> @slto16f64(<16 x i64> %a) { ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 ; NODQ-NEXT: vmovq %xmm1, %rax ; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 ; NODQ-NEXT: retq @@ -1275,13 +1275,13 @@ define <16 x float> @ulto16f32(<16 x i64> %a) { ; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax ; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax ; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 +; NODQ-NEXT: vextractf32x4 
$3, %zmm0, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax ; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll index acc4b7e138118..258cc2031ae8b 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -358,11 +358,11 @@ define void @unnatural_cfg2(i32* %p0, i32 %a0) { ; CHECK: %loop.header ; CHECK: %loop.body1 ; CHECK: %loop.body2 +; CHECK: %loop.body3 +; CHECK: %loop.inner1.begin ; CHECK: %loop.body4 ; CHECK: %loop.inner2.begin ; CHECK: %loop.inner2.begin -; CHECK: %loop.body3 -; CHECK: %loop.inner1.begin ; CHECK: %bail entry: diff --git a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll index 2e39fb976c752..aca23b032708d 100644 --- a/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll +++ b/llvm/test/CodeGen/X86/copysign-constant-magnitude.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-apple-macosx10.10.0 < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @@ -12,9 +13,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" define double @mag_pos0_double(double %x) nounwind { ; CHECK-LABEL: mag_pos0_double: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps [[SIGNMASK1]](%rip), %xmm0 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; %y = call double @copysign(double 0.0, double %x) ret double %y } @@ -25,10 +25,9 @@ define double @mag_pos0_double(double %x) nounwind { define double @mag_neg0_double(double %x) nounwind { ; CHECK-LABEL: mag_neg0_double: ; CHECK: ## %bb.0: -; CHECK-NEXT: movsd [[SIGNMASK2]](%rip), %xmm1 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: andps %xmm1, %xmm0 ; CHECK-NEXT: retq -; %y = call double @copysign(double -0.0, double %x) ret double %y } @@ -42,11 +41,10 @@ define double @mag_neg0_double(double %x) nounwind { define double @mag_pos1_double(double %x) nounwind { ; CHECK-LABEL: mag_pos1_double: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps [[SIGNMASK3]](%rip), %xmm0 -; CHECK-NEXT: movsd [[ONE3]](%rip), %xmm1 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq -; %y = call double @copysign(double 1.0, double %x) ret double %y } @@ -61,10 +59,9 @@ define double @mag_pos1_double(double %x) nounwind { define double @mag_neg1_double(double %x) nounwind { ; CHECK-LABEL: mag_neg1_double: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps [[SIGNMASK4]](%rip), %xmm0 -; CHECK-NEXT: orps [[ONE4]](%rip), %xmm0 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: orps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; %y = call double @copysign(double -1.0, double %x) ret double %y } @@ -78,9 +75,8 @@ define double @mag_neg1_double(double %x) nounwind { define float @mag_pos0_float(float %x) nounwind { ; CHECK-LABEL: mag_pos0_float: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps [[SIGNMASK5]](%rip), %xmm0 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; %y = call float @copysignf(float 0.0, float %x) ret float %y } @@ -91,10 +87,9 @@ define float @mag_pos0_float(float %x) nounwind { define float @mag_neg0_float(float %x) nounwind { ; CHECK-LABEL: mag_neg0_float: ; CHECK: ## %bb.0: -; CHECK-NEXT: movss [[SIGNMASK6]](%rip), %xmm1 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: andps %xmm1, %xmm0 ; 
CHECK-NEXT: retq -; %y = call float @copysignf(float -0.0, float %x) ret float %y } @@ -110,11 +105,10 @@ define float @mag_neg0_float(float %x) nounwind { define float @mag_pos1_float(float %x) nounwind { ; CHECK-LABEL: mag_pos1_float: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps [[SIGNMASK7]](%rip), %xmm0 -; CHECK-NEXT: movss [[ONE7]](%rip), %xmm1 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-NEXT: orps %xmm1, %xmm0 ; CHECK-NEXT: retq -; %y = call float @copysignf(float 1.0, float %x) ret float %y } @@ -133,10 +127,9 @@ define float @mag_pos1_float(float %x) nounwind { define float @mag_neg1_float(float %x) nounwind { ; CHECK-LABEL: mag_neg1_float: ; CHECK: ## %bb.0: -; CHECK-NEXT: andps [[SIGNMASK8]](%rip), %xmm0 -; CHECK-NEXT: orps [[ONE8]](%rip), %xmm0 +; CHECK-NEXT: andps {{.*}}(%rip), %xmm0 +; CHECK-NEXT: orps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; %y = call float @copysignf(float -1.0, float %x) ret float %y } diff --git a/llvm/test/CodeGen/X86/domain-reassignment.mir b/llvm/test/CodeGen/X86/domain-reassignment.mir index 38755344849a0..e24a5ded09a00 100644 --- a/llvm/test/CodeGen/X86/domain-reassignment.mir +++ b/llvm/test/CodeGen/X86/domain-reassignment.mir @@ -167,7 +167,7 @@ body: | bb.1.if: successors: %bb.3(0x80000000) - %14 = VCMPSSZrr %7, %8, 0 + %14 = VCMPSSZrr %7, %8, 0, implicit $mxcsr ; check that cross domain copies are replaced with same domain copies. @@ -177,7 +177,7 @@ body: | bb.2.else: successors: %bb.3(0x80000000) - %12 = VCMPSSZrr %9, %10, 0 + %12 = VCMPSSZrr %9, %10, 0, implicit $mxcsr ; check that cross domain copies are replaced with same domain copies. @@ -292,7 +292,7 @@ body: | %3 = COPY $zmm2 %4 = COPY $zmm3 - %5 = VCMPPDZrri %3, %4, 0 + %5 = VCMPPDZrri %3, %4, 0, implicit $mxcsr %6 = COPY %5 %7 = COPY %6.sub_8bit @@ -411,7 +411,7 @@ body: | %3 = COPY $zmm2 %4 = COPY $zmm3 - %5 = VCMPPSZrri %3, %4, 0 + %5 = VCMPPSZrri %3, %4, 0, implicit $mxcsr %6 = COPY %5 %7 = COPY %6.sub_16bit diff --git a/llvm/test/CodeGen/X86/dwarf-headers.ll b/llvm/test/CodeGen/X86/dwarf-headers.ll index ef626ad7003bb..6159fc29f8623 100644 --- a/llvm/test/CodeGen/X86/dwarf-headers.ll +++ b/llvm/test/CodeGen/X86/dwarf-headers.ll @@ -75,7 +75,7 @@ ; O-5: .debug_info contents: ; O-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_skeleton abbr_offset ; O-5-SAME: DWO_id = 0xccd7e58ef8bf4aa6 -; O-5: 0x00000014: DW_TAG_compile_unit +; O-5: 0x00000014: DW_TAG_skeleton_unit ; ; DWO-5: .debug_info.dwo contents: ; DWO-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir index eaf68b9bb210b..3b0fbcba43488 100755 --- a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir +++ b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir @@ -167,14 +167,14 @@ body: | $ymm0 = VPADDWZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VPADDWYrr $ymm0, $ymm1 $ymm0 = VPADDWZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMULPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMULPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMULPDYrr $ymm0, $ymm1 - $ymm0 = VMULPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMULPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMULPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMULPSYrr $ymm0, $ymm1 - $ymm0 = VMULPSZ256rr $ymm0, $ymm1 + ; CHECK: $ymm0 = VMULPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMULPDZ256rm $ymm0, $rip, 1, 
$rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMULPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMULPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMULPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMULPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMULPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMULPSZ256rr $ymm0, $ymm1, implicit $mxcsr ; CHECK: $ymm0 = VORPDYrm $ymm0, $rip, 1, $rax, 0, $noreg $ymm0 = VORPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VORPDYrr $ymm0, $ymm1 @@ -315,14 +315,14 @@ body: | $ymm0 = VPXORQZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VPXORYrr $ymm0, $ymm1 $ymm0 = VPXORQZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VADDPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VADDPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VADDPDYrr $ymm0, $ymm1 - $ymm0 = VADDPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VADDPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VADDPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VADDPSYrr $ymm0, $ymm1 - $ymm0 = VADDPSZ256rr $ymm0, $ymm1 + ; CHECK: $ymm0 = VADDPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VADDPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VADDPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VADDPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VADDPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VADDPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VADDPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VADDPSZ256rr $ymm0, $ymm1, implicit $mxcsr ; CHECK: $ymm0 = VANDNPDYrm $ymm0, $rip, 1, $rax, 0, $noreg $ymm0 = VANDNPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VANDNPDYrr $ymm0, $ymm1 @@ -339,46 +339,46 @@ body: | $ymm0 = VANDPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VANDPSYrr $ymm0, $ymm1 $ymm0 = VANDPSZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VDIVPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VDIVPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VDIVPDYrr $ymm0, $ymm1 - $ymm0 = VDIVPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VDIVPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VDIVPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VDIVPSYrr $ymm0, $ymm1 - $ymm0 = VDIVPSZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMAXCPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMAXCPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMAXCPDYrr $ymm0, $ymm1 - $ymm0 = VMAXCPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMAXCPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMAXCPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMAXCPSYrr $ymm0, $ymm1 - $ymm0 = VMAXCPSZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMAXPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMAXPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMAXPDYrr $ymm0, $ymm1 - $ymm0 = VMAXPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMAXPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMAXPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMAXPSYrr $ymm0, $ymm1 - $ymm0 = VMAXPSZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMINCPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMINCPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMINCPDYrr $ymm0, $ymm1 - $ymm0 = VMINCPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMINCPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMINCPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMINCPSYrr $ymm0, $ymm1 - $ymm0 = VMINCPSZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMINPDYrm $ymm0, $rip, 1, $rax, 
0, $noreg - $ymm0 = VMINPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMINPDYrr $ymm0, $ymm1 - $ymm0 = VMINPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VMINPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VMINPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VMINPSYrr $ymm0, $ymm1 - $ymm0 = VMINPSZ256rr $ymm0, $ymm1 + ; CHECK: $ymm0 = VDIVPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VDIVPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VDIVPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VDIVPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VDIVPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VDIVPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VDIVPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VDIVPSZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMAXCPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMAXCPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMAXCPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMAXCPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMAXCPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMAXCPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMAXCPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMAXCPSZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMAXPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMAXPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMAXPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMAXPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMAXPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMAXPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMAXPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMAXPSZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMINCPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMINCPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMINCPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMINCPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMINCPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMINCPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMINCPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMINCPSZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMINPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMINPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMINPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMINPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VMINPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VMINPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VMINPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VMINPSZ256rr $ymm0, $ymm1, implicit $mxcsr ; CHECK: $ymm0 = VXORPDYrm $ymm0, $rip, 1, $rax, 0, $noreg $ymm0 = VXORPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VXORPDYrr $ymm0, $ymm1 @@ -419,14 +419,14 @@ body: | $ymm0 = VUNPCKLPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VUNPCKLPSYrr $ymm0, $ymm1 $ymm0 = VUNPCKLPSZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VSUBPDYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 = VSUBPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VSUBPDYrr $ymm0, $ymm1 - $ymm0 = VSUBPDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VSUBPSYrm $ymm0, $rip, 1, $rax, 0, $noreg - $ymm0 
= VSUBPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm0 = VSUBPSYrr $ymm0, $ymm1 - $ymm0 = VSUBPSZ256rr $ymm0, $ymm1 + ; CHECK: $ymm0 = VSUBPDYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VSUBPDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VSUBPDYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VSUBPDZ256rr $ymm0, $ymm1, implicit $mxcsr + ; CHECK: $ymm0 = VSUBPSYrm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm0 = VSUBPSZ256rm $ymm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VSUBPSYrr $ymm0, $ymm1, implicit $mxcsr + $ymm0 = VSUBPSZ256rr $ymm0, $ymm1, implicit $mxcsr ; CHECK: $ymm0 = VPUNPCKHBWYrm $ymm0, $rip, 1, $rax, 0, $noreg $ymm0 = VPUNPCKHBWZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VPUNPCKHBWYrr $ymm0, $ymm1 @@ -459,150 +459,150 @@ body: | $ymm0 = VPUNPCKLWDZ256rm $ymm0, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm0 = VPUNPCKLWDYrr $ymm0, $ymm1 $ymm0 = VPUNPCKLWDZ256rr $ymm0, $ymm1 - ; CHECK: $ymm0 = VFMADD132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADD132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADD132PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADD132PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADD132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADD132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADD132PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADD132PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADD213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADD213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADD213PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADD213PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADD213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADD213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADD213PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADD213PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADD231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADD231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADD231PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADD231PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADD231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADD231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADD231PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADD231PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADDSUB132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADDSUB132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADDSUB132PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADDSUB132PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADDSUB132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADDSUB132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADDSUB132PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADDSUB132PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADDSUB213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADDSUB213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADDSUB213PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADDSUB213PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADDSUB213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADDSUB213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADDSUB213PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADDSUB213PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADDSUB231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = 
VFMADDSUB231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADDSUB231PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADDSUB231PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMADDSUB231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMADDSUB231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMADDSUB231PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMADDSUB231PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUB132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUB132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUB132PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUB132PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUB132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUB132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUB132PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUB132PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUB213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUB213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUB213PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUB213PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUB213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUB213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUB213PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUB213PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUB231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUB231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUB231PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUB231PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUB231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUB231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUB231PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUB231PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUBADD132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUBADD132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUBADD132PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUBADD132PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUBADD132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUBADD132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUBADD132PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUBADD132PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUBADD213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUBADD213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUBADD213PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUBADD213PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUBADD213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUBADD213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUBADD213PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUBADD213PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUBADD231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUBADD231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUBADD231PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUBADD231PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFMSUBADD231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFMSUBADD231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFMSUBADD231PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFMSUBADD231PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMADD132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMADD132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: 
$ymm0 = VFNMADD132PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMADD132PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMADD132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMADD132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMADD132PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMADD132PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMADD213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMADD213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMADD213PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMADD213PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMADD213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMADD213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMADD213PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMADD213PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMADD231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMADD231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMADD231PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMADD231PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMADD231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMADD231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMADD231PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMADD231PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMSUB132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMSUB132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMSUB132PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMSUB132PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMSUB132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMSUB132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMSUB132PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMSUB132PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMSUB213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMSUB213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMSUB213PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMSUB213PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMSUB213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMSUB213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMSUB213PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMSUB213PSZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMSUB231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMSUB231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMSUB231PDYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMSUB231PDZ256r $ymm0, $ymm1, $ymm2 - ; CHECK: $ymm0 = VFNMSUB231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - $ymm0 = VFNMSUB231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VFNMSUB231PSYr $ymm0, $ymm1, $ymm2 - $ymm0 = VFNMSUB231PSZ256r $ymm0, $ymm1, $ymm2 + ; CHECK: $ymm0 = VFMADD132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADD132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD132PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADD132PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADD132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD132PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADD132PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADD213PDZ256m $ymm0, $ymm0, $rsi, 
1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD213PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADD213PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADD213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD213PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADD213PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADD231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD231PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADD231PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADD231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADD231PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADD231PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADDSUB132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB132PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADDSUB132PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADDSUB132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB132PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADDSUB132PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADDSUB213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB213PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADDSUB213PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADDSUB213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB213PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADDSUB213PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADDSUB231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB231PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADDSUB231PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMADDSUB231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMADDSUB231PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMADDSUB231PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUB132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB132PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUB132PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUB132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB132PSYr 
$ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUB132PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUB213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB213PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUB213PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUB213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB213PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUB213PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUB231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB231PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUB231PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUB231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUB231PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUB231PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUBADD132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD132PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUBADD132PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUBADD132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD132PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUBADD132PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUBADD213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD213PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUBADD213PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUBADD213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD213PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUBADD213PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUBADD231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD231PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUBADD231PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFMSUBADD231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFMSUBADD231PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFMSUBADD231PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMADD132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD132PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMADD132PDZ256r 
$ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMADD132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD132PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMADD132PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMADD213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD213PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMADD213PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMADD213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD213PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMADD213PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMADD231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD231PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMADD231PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMADD231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMADD231PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMADD231PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB132PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMSUB132PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB132PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMSUB132PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB132PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMSUB132PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB132PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMSUB132PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB213PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMSUB213PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB213PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMSUB213PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB213PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMSUB213PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB213PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMSUB213PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB231PDYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMSUB231PDZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB231PDYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMSUB231PDZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB231PSYm $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VFNMSUB231PSZ256m $ymm0, $ymm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VFNMSUB231PSYr $ymm0, $ymm1, $ymm2, implicit $mxcsr + $ymm0 = VFNMSUB231PSZ256r $ymm0, $ymm1, $ymm2, implicit $mxcsr ; CHECK: $ymm0 = VPSRADYri $ymm0, 7 $ymm0 = VPSRADZ256ri $ymm0, 
7 ; CHECK: $ymm0 = VPSRADYrm $ymm0, $rip, 1, $rax, 0, $noreg @@ -811,50 +811,50 @@ body: | $ymm0 = VCVTDQ2PDZ256rm $rdi, 1, $noreg, 0, $noreg ; CHECK: $ymm0 = VCVTDQ2PDYrr $xmm0 $ymm0 = VCVTDQ2PDZ256rr $xmm0 - ; CHECK: $ymm0 = VCVTDQ2PSYrm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VCVTDQ2PSZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VCVTDQ2PSYrr $ymm0 - $ymm0 = VCVTDQ2PSZ256rr $ymm0 - ; CHECK: $xmm0 = VCVTPD2DQYrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPD2DQYrr $ymm0 - $xmm0 = VCVTPD2DQZ256rr $ymm0 - ; CHECK: $xmm0 = VCVTPD2PSYrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPD2PSZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPD2PSYrr $ymm0 - $xmm0 = VCVTPD2PSZ256rr $ymm0 - ; CHECK: $ymm0 = VCVTPS2DQYrm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VCVTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VCVTPS2DQYrr $ymm0 - $ymm0 = VCVTPS2DQZ256rr $ymm0 - ; CHECK: $ymm0 = VCVTPS2PDYrm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VCVTPS2PDZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VCVTPS2PDYrr $xmm0 - $ymm0 = VCVTPS2PDZ256rr $xmm0 - ; CHECK: VCVTPS2PHYmr $rdi, 1, $noreg, 0, $noreg, $ymm0, 0 - VCVTPS2PHZ256mr $rdi, 1, $noreg, 0, $noreg, $ymm0, 0 - ; CHECK: $xmm0 = VCVTPS2PHYrr $ymm0, 0 - $xmm0 = VCVTPS2PHZ256rr $ymm0, 0 - ; CHECK: $ymm0 = VCVTPH2PSYrm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VCVTPH2PSZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VCVTPH2PSYrr $xmm0 - $ymm0 = VCVTPH2PSZ256rr $xmm0 - ; CHECK: $xmm0 = VCVTTPD2DQYrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTTPD2DQYrr $ymm0 - $xmm0 = VCVTTPD2DQZ256rr $ymm0 - ; CHECK: $ymm0 = VCVTTPS2DQYrm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VCVTTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VCVTTPS2DQYrr $ymm0 - $ymm0 = VCVTTPS2DQZ256rr $ymm0 - ; CHECK: $ymm0 = VSQRTPDYm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VSQRTPDZ256m $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VSQRTPDYr $ymm0 - $ymm0 = VSQRTPDZ256r $ymm0 - ; CHECK: $ymm0 = VSQRTPSYm $rdi, 1, $noreg, 0, $noreg - $ymm0 = VSQRTPSZ256m $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm0 = VSQRTPSYr $ymm0 - $ymm0 = VSQRTPSZ256r $ymm0 + ; CHECK: $ymm0 = VCVTDQ2PSYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VCVTDQ2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VCVTDQ2PSYrr $ymm0, implicit $mxcsr + $ymm0 = VCVTDQ2PSZ256rr $ymm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2DQYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2DQYrr $ymm0, implicit $mxcsr + $xmm0 = VCVTPD2DQZ256rr $ymm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2PSYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPD2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2PSYrr $ymm0, implicit $mxcsr + $xmm0 = VCVTPD2PSZ256rr $ymm0, implicit $mxcsr + ; CHECK: $ymm0 = VCVTPS2DQYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VCVTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VCVTPS2DQYrr $ymm0, implicit $mxcsr + $ymm0 = VCVTPS2DQZ256rr $ymm0, implicit $mxcsr + ; CHECK: $ymm0 = VCVTPS2PDYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VCVTPS2PDZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VCVTPS2PDYrr $xmm0, implicit $mxcsr + $ymm0 = VCVTPS2PDZ256rr $xmm0, implicit $mxcsr + ; CHECK: VCVTPS2PHYmr $rdi, 1, $noreg, 0, $noreg, $ymm0, 0, implicit $mxcsr + VCVTPS2PHZ256mr 
$rdi, 1, $noreg, 0, $noreg, $ymm0, 0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPS2PHYrr $ymm0, 0, implicit $mxcsr + $xmm0 = VCVTPS2PHZ256rr $ymm0, 0, implicit $mxcsr + ; CHECK: $ymm0 = VCVTPH2PSYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VCVTPH2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VCVTPH2PSYrr $xmm0, implicit $mxcsr + $ymm0 = VCVTPH2PSZ256rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTTPD2DQYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTTPD2DQYrr $ymm0, implicit $mxcsr + $xmm0 = VCVTTPD2DQZ256rr $ymm0, implicit $mxcsr + ; CHECK: $ymm0 = VCVTTPS2DQYrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VCVTTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VCVTTPS2DQYrr $ymm0, implicit $mxcsr + $ymm0 = VCVTTPS2DQZ256rr $ymm0, implicit $mxcsr + ; CHECK: $ymm0 = VSQRTPDYm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VSQRTPDZ256m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VSQRTPDYr $ymm0, implicit $mxcsr + $ymm0 = VSQRTPDZ256r $ymm0, implicit $mxcsr + ; CHECK: $ymm0 = VSQRTPSYm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm0 = VSQRTPSZ256m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm0 = VSQRTPSYr $ymm0, implicit $mxcsr + $ymm0 = VSQRTPSZ256r $ymm0, implicit $mxcsr ; CHECK: $ymm0 = VPALIGNRYrmi $ymm0, $rdi, 1, $noreg, 0, $noreg, 1 $ymm0 = VPALIGNRZ256rmi $ymm0, $rdi, 1, $noreg, 0, $noreg, 1 ; CHECK: $ymm0 = VPALIGNRYrri $ymm0, $ymm1, 1 @@ -889,14 +889,14 @@ body: | $ymm0 = VSHUFPSZ256rmi $ymm0, $rdi, 1, $noreg, 0, $noreg, -24 ; CHECK: $ymm0 = VSHUFPSYrri $ymm0, $ymm1, -24 $ymm0 = VSHUFPSZ256rri $ymm0, $ymm1, -24 - ; CHECK: $ymm0 = VROUNDPDYm $rip, 1, $rax, 0, $noreg, 15 - $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $ymm0 = VROUNDPDYr $ymm0, 15 - $ymm0 = VRNDSCALEPDZ256rri $ymm0, 15 - ; CHECK: $ymm0 = VROUNDPSYm $rip, 1, $rax, 0, $noreg, 15 - $ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $ymm0 = VROUNDPSYr $ymm0, 15 - $ymm0 = VRNDSCALEPSZ256rri $ymm0, 15 + ; CHECK: $ymm0 = VROUNDPDYm $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPDYr $ymm0, 15, implicit $mxcsr + $ymm0 = VRNDSCALEPDZ256rri $ymm0, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPSYm $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $ymm0 = VROUNDPSYr $ymm0, 15, implicit $mxcsr + $ymm0 = VRNDSCALEPSZ256rri $ymm0, 15, implicit $mxcsr ; CHECK: $ymm0 = VPERM2F128rm $ymm0, $rip, 1, $rax, 0, $noreg, 32 $ymm0 = VSHUFF32X4Z256rmi $ymm0, $rip, 1, $rax, 0, $noreg, 228 ; CHECK: $ymm0 = VPERM2F128rr $ymm0, $ymm1, 32 @@ -1075,46 +1075,46 @@ body: | VMOVLPSZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm0 ; CHECK: $xmm0 = VMOVLPSrm $xmm0, $rdi, 1, $noreg, 0, $noreg $xmm0 = VMOVLPSZ128rm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VMAXCPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXCPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXCPDrr $xmm0, $xmm1 - $xmm0 = VMAXCPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXCPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXCPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXCPSrr $xmm0, $xmm1 - $xmm0 = VMAXCPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - 
; CHECK: $xmm0 = VMAXPDrr $xmm0, $xmm1 - $xmm0 = VMAXPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXPSrr $xmm0, $xmm1 - $xmm0 = VMAXPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINCPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINCPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINCPDrr $xmm0, $xmm1 - $xmm0 = VMINCPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINCPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINCPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINCPSrr $xmm0, $xmm1 - $xmm0 = VMINCPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINPDrr $xmm0, $xmm1 - $xmm0 = VMINPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINPSrr $xmm0, $xmm1 - $xmm0 = VMINPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMULPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMULPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMULPDrr $xmm0, $xmm1 - $xmm0 = VMULPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMULPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMULPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMULPSrr $xmm0, $xmm1 - $xmm0 = VMULPSZ128rr $xmm0, $xmm1 + ; CHECK: $xmm0 = VMAXCPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXCPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXCPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXCPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXCPSZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXPSZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINCPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINCPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINCPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINCPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINCPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINCPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINCPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINCPSZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINPSZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMULPDrm 
$xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMULPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMULPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMULPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMULPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMULPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMULPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMULPSZ128rr $xmm0, $xmm1, implicit $mxcsr ; CHECK: $xmm0 = VORPDrm $xmm0, $rip, 1, $rax, 0, $noreg $xmm0 = VORPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VORPDrr $xmm0, $xmm1 @@ -1295,14 +1295,14 @@ body: | $xmm0 = VPSUBWZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VPSUBWrr $xmm0, $xmm1 $xmm0 = VPSUBWZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VADDPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VADDPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VADDPDrr $xmm0, $xmm1 - $xmm0 = VADDPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VADDPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VADDPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VADDPSrr $xmm0, $xmm1 - $xmm0 = VADDPSZ128rr $xmm0, $xmm1 + ; CHECK: $xmm0 = VADDPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VADDPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VADDPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VADDPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VADDPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VADDPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VADDPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VADDPSZ128rr $xmm0, $xmm1, implicit $mxcsr ; CHECK: $xmm0 = VANDNPDrm $xmm0, $rip, 1, $rax, 0, $noreg $xmm0 = VANDNPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VANDNPDrr $xmm0, $xmm1 @@ -1319,14 +1319,14 @@ body: | $xmm0 = VANDPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VANDPSrr $xmm0, $xmm1 $xmm0 = VANDPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VDIVPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VDIVPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VDIVPDrr $xmm0, $xmm1 - $xmm0 = VDIVPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VDIVPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VDIVPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VDIVPSrr $xmm0, $xmm1 - $xmm0 = VDIVPSZ128rr $xmm0, $xmm1 + ; CHECK: $xmm0 = VDIVPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VDIVPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VDIVPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VDIVPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VDIVPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VDIVPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VDIVPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VDIVPSZ128rr $xmm0, $xmm1, implicit $mxcsr ; CHECK: $xmm0 = VPXORrm $xmm0, $rip, 1, $rax, 0, $noreg $xmm0 = VPXORDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VPXORrr $xmm0, $xmm1 @@ -1335,14 +1335,14 @@ body: | $xmm0 = VPXORQZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VPXORrr $xmm0, $xmm1 $xmm0 = VPXORQZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VSUBPDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VSUBPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VSUBPDrr $xmm0, $xmm1 - $xmm0 = VSUBPDZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VSUBPSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VSUBPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = 
VSUBPSrr $xmm0, $xmm1 - $xmm0 = VSUBPSZ128rr $xmm0, $xmm1 + ; CHECK: $xmm0 = VSUBPDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VSUBPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSUBPDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSUBPDZ128rr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VSUBPSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VSUBPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSUBPSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSUBPSZ128rr $xmm0, $xmm1, implicit $mxcsr ; CHECK: $xmm0 = VXORPDrm $xmm0, $rip, 1, $rax, 0, $noreg $xmm0 = VXORPDZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VXORPDrr $xmm0, $xmm1 @@ -1423,150 +1423,150 @@ body: | $xmm0 = VUNPCKLPSZ128rm $xmm0, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VUNPCKLPSrr $xmm0, $xmm1 $xmm0 = VUNPCKLPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = VFMADD132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD132PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD132PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD132PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD132PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD213PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD213PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD213PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD213PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD231PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD231PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD231PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD231PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADDSUB132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADDSUB132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADDSUB132PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADDSUB132PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADDSUB132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADDSUB132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADDSUB132PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADDSUB132PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADDSUB213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADDSUB213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADDSUB213PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADDSUB213PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADDSUB213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADDSUB213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADDSUB213PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADDSUB213PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADDSUB231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADDSUB231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADDSUB231PDr $xmm0, $xmm1, $xmm2 - $xmm0 
= VFMADDSUB231PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADDSUB231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADDSUB231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADDSUB231PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADDSUB231PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB132PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB132PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB132PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB132PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB213PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB213PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB213PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB213PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB231PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB231PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB231PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB231PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUBADD132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUBADD132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUBADD132PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUBADD132PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUBADD132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUBADD132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUBADD132PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUBADD132PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUBADD213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUBADD213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUBADD213PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUBADD213PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUBADD213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUBADD213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUBADD213PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUBADD213PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUBADD231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUBADD231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUBADD231PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUBADD231PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUBADD231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUBADD231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUBADD231PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUBADD231PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD132PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD132PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 
0, $noreg - $xmm0 = VFNMADD132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD132PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD132PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD213PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD213PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD213PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD213PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD231PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD231PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD231PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD231PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB132PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB132PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB132PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB132PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB213PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB213PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB213PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB213PSZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB231PDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB231PDZ128r $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB231PSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB231PSZ128r $xmm0, $xmm1, $xmm2 + ; CHECK: $xmm0 = VFMADD132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD132PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD132PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD213PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: 
$xmm0 = VFMADD213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD213PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD231PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD231PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADDSUB132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB132PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADDSUB132PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADDSUB132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB132PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADDSUB132PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADDSUB213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB213PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADDSUB213PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADDSUB213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB213PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADDSUB213PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADDSUB231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB231PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADDSUB231PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADDSUB231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADDSUB231PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADDSUB231PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB132PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB132PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = 
VFMSUB213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB213PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB213PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB231PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB231PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUBADD132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD132PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUBADD132PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUBADD132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD132PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUBADD132PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUBADD213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD213PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUBADD213PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUBADD213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD213PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUBADD213PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUBADD231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD231PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUBADD231PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUBADD231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUBADD231PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUBADD231PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD132PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: 
$xmm0 = VFNMADD132PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD132PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD213PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD213PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD231PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD231PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB132PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB132PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB132PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB132PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB213PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB213PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB213PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB213PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231PDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB231PDZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231PDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB231PDZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231PSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB231PSZ128m $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231PSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB231PSZ128r $xmm0, $xmm1, $xmm2, implicit $mxcsr ; CHECK: $xmm0 = VPSLLDri $xmm0, 7 $xmm0 = VPSLLDZ128ri $xmm0, 7 ; CHECK: $xmm0 = VPSLLDrm $xmm0, $rip, 1, $rax, 0, $noreg @@ -1653,50 +1653,50 @@ body: | $xmm0 = VPERMILPSZ128rm $xmm0, $rdi, 1, $noreg, 0, $noreg ; CHECK: $xmm0 = VPERMILPSrr $xmm0, $xmm1 $xmm0 = VPERMILPSZ128rr $xmm0, $xmm1 - ; CHECK: $xmm0 = 
VCVTPH2PSrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPH2PSZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPH2PSrr $xmm0 - $xmm0 = VCVTPH2PSZ128rr $xmm0 + ; CHECK: $xmm0 = VCVTPH2PSrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPH2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPH2PSrr $xmm0, implicit $mxcsr + $xmm0 = VCVTPH2PSZ128rr $xmm0, implicit $mxcsr ; CHECK: $xmm0 = VCVTDQ2PDrm $rdi, 1, $noreg, 0, $noreg $xmm0 = VCVTDQ2PDZ128rm $rdi, 1, $noreg, 0, $noreg ; CHECK: $xmm0 = VCVTDQ2PDrr $xmm0 $xmm0 = VCVTDQ2PDZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTDQ2PSrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTDQ2PSZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTDQ2PSrr $xmm0 - $xmm0 = VCVTDQ2PSZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTPD2DQrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPD2DQrr $xmm0 - $xmm0 = VCVTPD2DQZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTPD2PSrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPD2PSZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPD2PSrr $xmm0 - $xmm0 = VCVTPD2PSZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTPS2DQrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPS2DQrr $xmm0 - $xmm0 = VCVTPS2DQZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTPS2PDrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTPS2PDZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTPS2PDrr $xmm0 - $xmm0 = VCVTPS2PDZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTTPD2DQrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTTPD2DQrr $xmm0 - $xmm0 = VCVTTPD2DQZ128rr $xmm0 - ; CHECK: $xmm0 = VCVTTPS2DQrm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTTPS2DQrr $xmm0 - $xmm0 = VCVTTPS2DQZ128rr $xmm0 - ; CHECK: $xmm0 = VSQRTPDm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VSQRTPDZ128m $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VSQRTPDr $xmm0 - $xmm0 = VSQRTPDZ128r $xmm0 - ; CHECK: $xmm0 = VSQRTPSm $rdi, 1, $noreg, 0, $noreg - $xmm0 = VSQRTPSZ128m $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VSQRTPSr $xmm0 - $xmm0 = VSQRTPSZ128r $xmm0 + ; CHECK: $xmm0 = VCVTDQ2PSrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTDQ2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTDQ2PSrr $xmm0, implicit $mxcsr + $xmm0 = VCVTDQ2PSZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2DQrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2DQrr $xmm0, implicit $mxcsr + $xmm0 = VCVTPD2DQZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2PSrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPD2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPD2PSrr $xmm0, implicit $mxcsr + $xmm0 = VCVTPD2PSZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPS2DQrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPS2DQrr $xmm0, implicit $mxcsr + $xmm0 = VCVTPS2DQZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPS2PDrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTPS2PDZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPS2PDrr $xmm0, implicit $mxcsr + $xmm0 = VCVTPS2PDZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTTPD2DQrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTTPD2DQZ128rm $rdi, 1, $noreg, 0, 
$noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTTPD2DQrr $xmm0, implicit $mxcsr + $xmm0 = VCVTTPD2DQZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTTPS2DQrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTTPS2DQrr $xmm0, implicit $mxcsr + $xmm0 = VCVTTPS2DQZ128rr $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTPDm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VSQRTPDZ128m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTPDr $xmm0, implicit $mxcsr + $xmm0 = VSQRTPDZ128r $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTPSm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VSQRTPSZ128m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTPSr $xmm0, implicit $mxcsr + $xmm0 = VSQRTPSZ128r $xmm0, implicit $mxcsr ; CHECK: $xmm0 = VMOVDDUPrm $rdi, 1, $noreg, 0, $noreg $xmm0 = VMOVDDUPZ128rm $rdi, 1, $noreg, 0, $noreg ; CHECK: $xmm0 = VMOVDDUPrr $xmm0 @@ -1763,10 +1763,10 @@ body: | $xmm0 = VBROADCASTI32X2Z128m $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VPBROADCASTQrr $xmm0 $xmm0 = VBROADCASTI32X2Z128r $xmm0 - ; CHECK: $xmm0 = VCVTPS2PHrr $xmm0, 2 - $xmm0 = VCVTPS2PHZ128rr $xmm0, 2 - ; CHECK: VCVTPS2PHmr $rdi, 1, $noreg, 0, $noreg, $xmm0, 2 - VCVTPS2PHZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm0, 2 + ; CHECK: $xmm0 = VCVTPS2PHrr $xmm0, 2, implicit $mxcsr + $xmm0 = VCVTPS2PHZ128rr $xmm0, 2, implicit $mxcsr + ; CHECK: VCVTPS2PHmr $rdi, 1, $noreg, 0, $noreg, $xmm0, 2, implicit $mxcsr + VCVTPS2PHZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm0, 2, implicit $mxcsr ; CHECK: $xmm0 = VPABSBrm $rip, 1, $rax, 0, $noreg $xmm0 = VPABSBZ128rm $rip, 1, $rax, 0, $noreg ; CHECK: $xmm0 = VPABSBrr $xmm0 @@ -1791,14 +1791,14 @@ body: | $xmm0 = VALIGNQZ128rmi $xmm0, $rip, 1, $rax, 0, $noreg, 1 ; CHECK: $xmm0 = VPALIGNRrri $xmm0, $xmm1, 8 $xmm0 = VALIGNQZ128rri $xmm0, $xmm1, 1 - ; CHECK: $xmm0 = VROUNDPDm $rip, 1, $rax, 0, $noreg, 15 - $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm0 = VROUNDPDr $xmm0, 15 - $xmm0 = VRNDSCALEPDZ128rri $xmm0, 15 - ; CHECK: $xmm0 = VROUNDPSm $rip, 1, $rax, 0, $noreg, 15 - $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm0 = VROUNDPSr $xmm0, 15 - $xmm0 = VRNDSCALEPSZ128rri $xmm0, 15 + ; CHECK: $xmm0 = VROUNDPDm $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPDr $xmm0, 15, implicit $mxcsr + $xmm0 = VRNDSCALEPDZ128rri $xmm0, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPSm $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDPSr $xmm0, 15, implicit $mxcsr + $xmm0 = VRNDSCALEPSZ128rri $xmm0, 15, implicit $mxcsr RET 0, $zmm0, $zmm1 ... 
@@ -1810,310 +1810,310 @@ name: evex_scalar_to_vex_test body: | bb.0: - ; CHECK: $xmm0 = VADDSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VADDSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VADDSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VADDSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VADDSDrr $xmm0, $xmm1 - $xmm0 = VADDSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VADDSDrr_Int $xmm0, $xmm1 - $xmm0 = VADDSDZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VADDSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VADDSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VADDSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VADDSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VADDSSrr $xmm0, $xmm1 - $xmm0 = VADDSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VADDSSrr_Int $xmm0, $xmm1 - $xmm0 = VADDSSZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VDIVSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VDIVSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VDIVSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VDIVSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VDIVSDrr $xmm0, $xmm1 - $xmm0 = VDIVSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VDIVSDrr_Int $xmm0, $xmm1 - $xmm0 = VDIVSDZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VDIVSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VDIVSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VDIVSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VDIVSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VDIVSSrr $xmm0, $xmm1 - $xmm0 = VDIVSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VDIVSSrr_Int $xmm0, $xmm1 - $xmm0 = VDIVSSZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXCSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXCSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXCSDrr $xmm0, $xmm1 - $xmm0 = VMAXCSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXCSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXCSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXCSSrr $xmm0, $xmm1 - $xmm0 = VMAXCSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXSDrr $xmm0, $xmm1 - $xmm0 = VMAXSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXSDrr_Int $xmm0, $xmm1 - $xmm0 = VMAXSDZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMAXSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMAXSSrr $xmm0, $xmm1 - $xmm0 = VMAXSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMAXSSrr_Int $xmm0, $xmm1 - $xmm0 = VMAXSSZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINCSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINCSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINCSDrr $xmm0, $xmm1 - $xmm0 = VMINCSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINCSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINCSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINCSSrr $xmm0, $xmm1 - $xmm0 = VMINCSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINSDrr $xmm0, $xmm1 - $xmm0 = VMINSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINSDrr_Int $xmm0, $xmm1 - $xmm0 = VMINSDZrr_Int $xmm0, $xmm1 - ; 
CHECK: $xmm0 = VMINSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMINSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMINSSrr $xmm0, $xmm1 - $xmm0 = VMINSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMINSSrr_Int $xmm0, $xmm1 - $xmm0 = VMINSSZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VMULSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMULSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMULSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMULSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMULSDrr $xmm0, $xmm1 - $xmm0 = VMULSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMULSDrr_Int $xmm0, $xmm1 - $xmm0 = VMULSDZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VMULSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMULSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMULSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VMULSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VMULSSrr $xmm0, $xmm1 - $xmm0 = VMULSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VMULSSrr_Int $xmm0, $xmm1 - $xmm0 = VMULSSZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VSUBSDrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VSUBSDZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VSUBSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VSUBSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VSUBSDrr $xmm0, $xmm1 - $xmm0 = VSUBSDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VSUBSDrr_Int $xmm0, $xmm1 - $xmm0 = VSUBSDZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VSUBSSrm $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VSUBSSZrm $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VSUBSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - $xmm0 = VSUBSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm0 = VSUBSSrr $xmm0, $xmm1 - $xmm0 = VSUBSSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VSUBSSrr_Int $xmm0, $xmm1 - $xmm0 = VSUBSSZrr_Int $xmm0, $xmm1 - ; CHECK: $xmm0 = VFMADD132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD132SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD132SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD132SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD132SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD132SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD132SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD132SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD132SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD213SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD213SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD213SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD213SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = 
VFMADD213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD213SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD213SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD213SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD213SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD231SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD231SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD231SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD231SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMADD231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMADD231SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD231SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMADD231SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMADD231SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB132SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB132SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB132SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB132SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB132SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB132SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB132SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB132SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB213SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB213SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB213SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB213SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB213SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB213SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB213SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB213SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB231SDr $xmm0, 
$xmm1, $xmm2 - $xmm0 = VFMSUB231SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB231SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB231SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFMSUB231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFMSUB231SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB231SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFMSUB231SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFMSUB231SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD132SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD132SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD132SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD132SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD132SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD132SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD132SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD132SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD213SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD213SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD213SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD213SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD213SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD213SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD213SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD213SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD231SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD231SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD231SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD231SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMADD231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMADD231SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMADD231SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMADD231SSr_Int $xmm0, $xmm1, $xmm2 - 
$xmm0 = VFNMADD231SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB132SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB132SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB132SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB132SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB132SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB132SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB132SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB132SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB213SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB213SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB213SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB213SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB213SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB213SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB213SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB213SSZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB231SDr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB231SDZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB231SDr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB231SDZr_Int $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - $xmm0 = VFNMSUB231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VFNMSUB231SSr $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB231SSZr $xmm0, $xmm1, $xmm2 - ; CHECK: $xmm0 = VFNMSUB231SSr_Int $xmm0, $xmm1, $xmm2 - $xmm0 = VFNMSUB231SSZr_Int $xmm0, $xmm1, $xmm2 + ; CHECK: $xmm0 = VADDSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VADDSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VADDSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VADDSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VADDSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VADDSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VADDSDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VADDSDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VADDSSrm 
$xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VADDSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VADDSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VADDSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VADDSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VADDSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VADDSSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VADDSSZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VDIVSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VDIVSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VDIVSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VDIVSDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VDIVSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VDIVSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VDIVSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VDIVSSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VDIVSSZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXCSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXCSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXCSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXCSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXCSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXSDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMAXSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMAXSSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMAXSSZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINCSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINCSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINCSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINCSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINCSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINCSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINCSSrr $xmm0, 
$xmm1, implicit $mxcsr + $xmm0 = VMINCSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINSDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINSDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMINSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMINSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMINSSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMINSSZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMULSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMULSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMULSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMULSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMULSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMULSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMULSDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMULSDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMULSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMULSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMULSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VMULSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VMULSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMULSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VMULSSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VMULSSZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSDrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VSUBSDZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSDrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VSUBSDZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSUBSDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSUBSDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSSrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VSUBSSZrm $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSSrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm0 = VSUBSSZrm_Int $xmm0, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSUBSSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VSUBSSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSUBSSZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, 
implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD132SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD132SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD132SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD132SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD132SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD213SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD213SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD213SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD213SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD213SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD231SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD231SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMADD231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD231SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMADD231SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMADD231SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit 
$mxcsr + ; CHECK: $xmm0 = VFMSUB132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB132SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB132SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB132SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB132SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB132SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB213SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB213SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB213SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB213SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB213SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB231SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB231SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFMSUB231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB231SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFMSUB231SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFMSUB231SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr 
+ ; CHECK: $xmm0 = VFNMADD132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD132SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD132SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD132SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD132SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD132SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD213SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD213SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD213SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD213SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD213SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD231SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD231SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMADD231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SSr $xmm0, $xmm1, $xmm2, 
implicit $mxcsr + $xmm0 = VFNMADD231SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMADD231SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMADD231SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB132SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB132SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB132SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB132SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB132SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB132SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB132SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB132SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB132SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB213SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB213SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB213SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB213SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB213SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB213SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB213SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB213SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB213SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SDm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB231SDZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SDm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB231SDZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SDr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB231SDZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SDr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB231SDZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SSm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB231SSZm $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = 
VFNMSUB231SSm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VFNMSUB231SSZm_Int $xmm0, $xmm0, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SSr $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB231SSZr $xmm0, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm0 = VFNMSUB231SSr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr + $xmm0 = VFNMSUB231SSZr_Int $xmm0, $xmm1, $xmm2, implicit $mxcsr ; CHECK: VPEXTRBmr $rdi, 1, $noreg, 0, $noreg, $xmm0, 3 VPEXTRBZmr $rdi, 1, $noreg, 0, $noreg, $xmm0, 3 ; CHECK: $eax = VPEXTRBrr $xmm0, 1 @@ -2148,34 +2148,34 @@ body: | $xmm0 = VPINSRWZrm $xmm0, $rsi, 1, $noreg, 0, $noreg, 3 ; CHECK: $xmm0 = VPINSRWrr $xmm0, $edi, 5 $xmm0 = VPINSRWZrr $xmm0, $edi, 5 - ; CHECK: $xmm0 = VSQRTSDm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VSQRTSDZm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VSQRTSDm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VSQRTSDZm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VSQRTSDr $xmm0, $noreg - $xmm0 = VSQRTSDZr $xmm0, $noreg - ; CHECK: $xmm0 = VSQRTSDr_Int $xmm0, $noreg - $xmm0 = VSQRTSDZr_Int $xmm0, $noreg - ; CHECK: $xmm0 = VSQRTSSm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VSQRTSSZm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VSQRTSSm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VSQRTSSZm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VSQRTSSr $xmm0, $xmm1 - $xmm0 = VSQRTSSZr $xmm0, $xmm1 - ; CHECK: $xmm0 = VSQRTSSr_Int $xmm0, $xmm1 - $xmm0 = VSQRTSSZr_Int $xmm0, $xmm1 - ; CHECK: $rdi = VCVTSD2SI64rr_Int $xmm0 - $rdi = VCVTSD2SI64Zrr_Int $xmm0 - ; CHECK: $edi = VCVTSD2SIrr_Int $xmm0 - $edi = VCVTSD2SIZrr_Int $xmm0 - ; CHECK: $xmm0 = VCVTSD2SSrm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSD2SSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSD2SSrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSD2SSZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSD2SSrr $xmm0, $xmm1 - $xmm0 = VCVTSD2SSZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VCVTSD2SSrr_Int $xmm0, $xmm1 - $xmm0 = VCVTSD2SSZrr_Int $xmm0, $xmm1 + ; CHECK: $xmm0 = VSQRTSDm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VSQRTSDZm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSDm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VSQRTSDZm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSDr $xmm0, $noreg, implicit $mxcsr + $xmm0 = VSQRTSDZr $xmm0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSDr_Int $xmm0, $noreg, implicit $mxcsr + $xmm0 = VSQRTSDZr_Int $xmm0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSSm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VSQRTSSZm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSSm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VSQRTSSZm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSSr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSQRTSSZr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VSQRTSSr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VSQRTSSZr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $rdi = VCVTSD2SI64rr_Int $xmm0, implicit $mxcsr + $rdi = VCVTSD2SI64Zrr_Int $xmm0, implicit $mxcsr + ; CHECK: $edi = VCVTSD2SIrr_Int $xmm0, implicit $mxcsr + $edi = VCVTSD2SIZrr_Int $xmm0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSD2SSrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSD2SSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: 
$xmm0 = VCVTSD2SSrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSD2SSZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSD2SSrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VCVTSD2SSZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSD2SSrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VCVTSD2SSZrr_Int $xmm0, $xmm1, implicit $mxcsr ; CHECK: $xmm0 = VCVTSI2SDrm $xmm0, $rdi, 1, $noreg, 0, $noreg $xmm0 = VCVTSI2SDZrm $xmm0, $rdi, 1, $noreg, 0, $noreg ; CHECK: $xmm0 = VCVTSI2SDrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg @@ -2184,78 +2184,78 @@ body: | $xmm0 = VCVTSI2SDZrr $xmm0, $edi ; CHECK: $xmm0 = VCVTSI2SDrr_Int $xmm0, $edi $xmm0 = VCVTSI2SDZrr_Int $xmm0, $edi - ; CHECK: $xmm0 = VCVTSI2SSrm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSI2SSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSI2SSrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSI2SSZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSI2SSrr $xmm0, $edi - $xmm0 = VCVTSI2SSZrr $xmm0, $edi - ; CHECK: $xmm0 = VCVTSI2SSrr_Int $xmm0, $edi - $xmm0 = VCVTSI2SSZrr_Int $xmm0, $edi - ; CHECK: $xmm0 = VCVTSI642SDrm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSI642SDZrm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSI642SDrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSI642SDZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSI642SDrr $xmm0, $rdi - $xmm0 = VCVTSI642SDZrr $xmm0, $rdi - ; CHECK: $xmm0 = VCVTSI642SDrr_Int $xmm0, $rdi - $xmm0 = VCVTSI642SDZrr_Int $xmm0, $rdi - ; CHECK: $xmm0 = VCVTSI642SSrm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSI642SSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSI642SSrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSI642SSZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSI642SSrr $xmm0, $rdi - $xmm0 = VCVTSI642SSZrr $xmm0, $rdi - ; CHECK: $xmm0 = VCVTSI642SSrr_Int $xmm0, $rdi - $xmm0 = VCVTSI642SSZrr_Int $xmm0, $rdi - ; CHECK: $xmm0 = VCVTSS2SDrm $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSS2SDZrm $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSS2SDrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - $xmm0 = VCVTSS2SDZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm0 = VCVTSS2SDrr $xmm0, $xmm1 - $xmm0 = VCVTSS2SDZrr $xmm0, $xmm1 - ; CHECK: $xmm0 = VCVTSS2SDrr_Int $xmm0, $xmm1 - $xmm0 = VCVTSS2SDZrr_Int $xmm0, $xmm1 - ; CHECK: $rdi = VCVTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg - $rdi = VCVTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg - ; CHECK: $rdi = VCVTSS2SI64rr_Int $xmm0 - $rdi = VCVTSS2SI64Zrr_Int $xmm0 - ; CHECK: $edi = VCVTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg - $edi = VCVTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg - ; CHECK: $edi = VCVTSS2SIrr_Int $xmm0 - $edi = VCVTSS2SIZrr_Int $xmm0 - ; CHECK: $rdi = VCVTTSD2SI64rm $rdi, 1, $noreg, 0, $noreg - $rdi = VCVTTSD2SI64Zrm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $rdi = VCVTTSD2SI64rm_Int $rdi, 1, $noreg, 0, $noreg - $rdi = VCVTTSD2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg - ; CHECK: $rdi = VCVTTSD2SI64rr $xmm0 - $rdi = VCVTTSD2SI64Zrr $xmm0 - ; CHECK: $rdi = VCVTTSD2SI64rr_Int $xmm0 - $rdi = VCVTTSD2SI64Zrr_Int $xmm0 - ; CHECK: $edi = VCVTTSD2SIrm $rdi, 1, $noreg, 0, $noreg - $edi = VCVTTSD2SIZrm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $edi = VCVTTSD2SIrm_Int $rdi, 1, $noreg, 0, $noreg - $edi = VCVTTSD2SIZrm_Int $rdi, 1, $noreg, 0, $noreg - ; CHECK: $edi = VCVTTSD2SIrr $xmm0 - $edi = VCVTTSD2SIZrr $xmm0 - ; CHECK: $edi = VCVTTSD2SIrr_Int $xmm0 - $edi = VCVTTSD2SIZrr_Int $xmm0 - ; 
CHECK: $rdi = VCVTTSS2SI64rm $rdi, 1, $noreg, 0, $noreg - $rdi = VCVTTSS2SI64Zrm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $rdi = VCVTTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg - $rdi = VCVTTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg - ; CHECK: $rdi = VCVTTSS2SI64rr $xmm0 - $rdi = VCVTTSS2SI64Zrr $xmm0 - ; CHECK: $rdi = VCVTTSS2SI64rr_Int $xmm0 - $rdi = VCVTTSS2SI64Zrr_Int $xmm0 - ; CHECK: $edi = VCVTTSS2SIrm $rdi, 1, $noreg, 0, $noreg - $edi = VCVTTSS2SIZrm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $edi = VCVTTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg - $edi = VCVTTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg - ; CHECK: $edi = VCVTTSS2SIrr $xmm0 - $edi = VCVTTSS2SIZrr $xmm0 - ; CHECK: $edi = VCVTTSS2SIrr_Int $xmm0 - $edi = VCVTTSS2SIZrr_Int $xmm0 + ; CHECK: $xmm0 = VCVTSI2SSrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSI2SSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI2SSrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSI2SSZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI2SSrr $xmm0, $edi, implicit $mxcsr + $xmm0 = VCVTSI2SSZrr $xmm0, $edi, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI2SSrr_Int $xmm0, $edi, implicit $mxcsr + $xmm0 = VCVTSI2SSZrr_Int $xmm0, $edi, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SDrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSI642SDZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SDrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSI642SDZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SDrr $xmm0, $rdi, implicit $mxcsr + $xmm0 = VCVTSI642SDZrr $xmm0, $rdi, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SDrr_Int $xmm0, $rdi, implicit $mxcsr + $xmm0 = VCVTSI642SDZrr_Int $xmm0, $rdi, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SSrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSI642SSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SSrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSI642SSZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SSrr $xmm0, $rdi, implicit $mxcsr + $xmm0 = VCVTSI642SSZrr $xmm0, $rdi, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSI642SSrr_Int $xmm0, $rdi, implicit $mxcsr + $xmm0 = VCVTSI642SSZrr_Int $xmm0, $rdi, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSS2SDrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSS2SDZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSS2SDrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm0 = VCVTSS2SDZrm_Int $xmm0, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSS2SDrr $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VCVTSS2SDZrr $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $xmm0 = VCVTSS2SDrr_Int $xmm0, $xmm1, implicit $mxcsr + $xmm0 = VCVTSS2SDZrr_Int $xmm0, $xmm1, implicit $mxcsr + ; CHECK: $rdi = VCVTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTSS2SI64rr_Int $xmm0, implicit $mxcsr + $rdi = VCVTSS2SI64Zrr_Int $xmm0, implicit $mxcsr + ; CHECK: $edi = VCVTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTSS2SIrr_Int $xmm0, implicit $mxcsr + $edi = VCVTSS2SIZrr_Int $xmm0, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64rm $rdi, 1, $noreg, 0, 
$noreg, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64rr $xmm0, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrr $xmm0, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64rr_Int $xmm0, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrr_Int $xmm0, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSD2SIZrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSD2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIrr $xmm0, implicit $mxcsr + $edi = VCVTTSD2SIZrr $xmm0, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIrr_Int $xmm0, implicit $mxcsr + $edi = VCVTTSD2SIZrr_Int $xmm0, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64rr $xmm0, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrr $xmm0, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64rr_Int $xmm0, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrr_Int $xmm0, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSS2SIZrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIrr $xmm0, implicit $mxcsr + $edi = VCVTTSS2SIZrr $xmm0, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIrr_Int $xmm0, implicit $mxcsr + $edi = VCVTTSS2SIZrr_Int $xmm0, implicit $mxcsr ; CHECK: $xmm0 = VMOV64toSDrr $rdi $xmm0 = VMOV64toSDZrr $rdi ; CHECK: $xmm0 = VMOVDI2SSrr $eax @@ -2354,22 +2354,22 @@ body: | $xmm0 = VINSERTPSZrm $xmm0, $rdi, 1, $noreg, 0, $noreg, 1 ; CHECK: $xmm0 = VINSERTPSrr $xmm0, $xmm0, 1 $xmm0 = VINSERTPSZrr $xmm0, $xmm0, 1 - ; CHECK: $xmm0 = VROUNDSDm $xmm0, $rip, 1, $rax, 0, $noreg, 15 - $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm0 = VROUNDSDr $xmm0, $xmm1, 15 - $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 15 - ; CHECK: $xmm0 = VROUNDSSm $xmm0, $rip, 1, $rax, 0, $noreg, 15 - $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm0 = VROUNDSSr $xmm0, $xmm1, 15 - $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 15 - ; CHECK: $xmm0 = VROUNDSDm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15 - $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm0 = VROUNDSDr_Int $xmm0, $xmm1, 15 - $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 15 - ; CHECK: $xmm0 = VROUNDSSm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15 - $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm0 = VROUNDSSr_Int $xmm0, $xmm1, 15 - $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 15 + ; CHECK: $xmm0 = VROUNDSDm $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDr $xmm0, $xmm1, 15, implicit $mxcsr + $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSm $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $rax, 0, $noreg, 15, 
implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSr $xmm0, $xmm1, 15, implicit $mxcsr + $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSDr_Int $xmm0, $xmm1, 15, implicit $mxcsr + $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm0 = VROUNDSSr_Int $xmm0, $xmm1, 15, implicit $mxcsr + $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 15, implicit $mxcsr RET 0, $zmm0, $zmm1 ... @@ -2530,14 +2530,14 @@ body: | $ymm16 = VPADDWZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VPADDWZ256rr $ymm16, $ymm1 $ymm16 = VPADDWZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMULPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMULPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMULPDZ256rr $ymm16, $ymm1 - $ymm16 = VMULPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMULPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMULPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMULPSZ256rr $ymm16, $ymm1 - $ymm16 = VMULPSZ256rr $ymm16, $ymm1 + ; CHECK: $ymm16 = VMULPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMULPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMULPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMULPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMULPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMULPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMULPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMULPSZ256rr $ymm16, $ymm1, implicit $mxcsr ; CHECK: $ymm16 = VORPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg $ymm16 = VORPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VORPDZ256rr $ymm16, $ymm1 @@ -2678,14 +2678,14 @@ body: | $ymm16 = VPXORQZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VPXORQZ256rr $ymm16, $ymm1 $ymm16 = VPXORQZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VADDPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VADDPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VADDPDZ256rr $ymm16, $ymm1 - $ymm16 = VADDPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VADDPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VADDPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VADDPSZ256rr $ymm16, $ymm1 - $ymm16 = VADDPSZ256rr $ymm16, $ymm1 + ; CHECK: $ymm16 = VADDPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VADDPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VADDPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VADDPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VADDPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VADDPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VADDPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VADDPSZ256rr $ymm16, $ymm1, implicit $mxcsr ; CHECK: $ymm16 = VANDNPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg $ymm16 = VANDNPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VANDNPDZ256rr $ymm16, $ymm1 @@ -2702,46 +2702,46 @@ body: | $ymm16 = VANDPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VANDPSZ256rr $ymm16, $ymm1 $ymm16 = VANDPSZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VDIVPDZ256rm 
$ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VDIVPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VDIVPDZ256rr $ymm16, $ymm1 - $ymm16 = VDIVPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VDIVPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VDIVPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VDIVPSZ256rr $ymm16, $ymm1 - $ymm16 = VDIVPSZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMAXCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMAXCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMAXCPDZ256rr $ymm16, $ymm1 - $ymm16 = VMAXCPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMAXCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMAXCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMAXCPSZ256rr $ymm16, $ymm1 - $ymm16 = VMAXCPSZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMAXPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMAXPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMAXPDZ256rr $ymm16, $ymm1 - $ymm16 = VMAXPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMAXPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMAXPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMAXPSZ256rr $ymm16, $ymm1 - $ymm16 = VMAXPSZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMINCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMINCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMINCPDZ256rr $ymm16, $ymm1 - $ymm16 = VMINCPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMINCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMINCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMINCPSZ256rr $ymm16, $ymm1 - $ymm16 = VMINCPSZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMINPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMINPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMINPDZ256rr $ymm16, $ymm1 - $ymm16 = VMINPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VMINPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VMINPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VMINPSZ256rr $ymm16, $ymm1 - $ymm16 = VMINPSZ256rr $ymm16, $ymm1 + ; CHECK: $ymm16 = VDIVPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VDIVPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VDIVPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VDIVPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VDIVPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VDIVPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VDIVPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VDIVPSZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMAXCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMAXCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMAXCPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMAXCPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMAXCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMAXCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMAXCPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMAXCPSZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMAXPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMAXPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMAXPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMAXPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMAXPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = 
VMAXPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMAXPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMAXPSZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMINCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMINCPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMINCPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMINCPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMINCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMINCPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMINCPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMINCPSZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMINPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMINPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMINPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMINPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VMINPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VMINPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VMINPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VMINPSZ256rr $ymm16, $ymm1, implicit $mxcsr ; CHECK: $ymm16 = VXORPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg $ymm16 = VXORPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VXORPDZ256rr $ymm16, $ymm1 @@ -2782,14 +2782,14 @@ body: | $ymm16 = VUNPCKLPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VUNPCKLPSZ256rr $ymm16, $ymm1 $ymm16 = VUNPCKLPSZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VSUBPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VSUBPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VSUBPDZ256rr $ymm16, $ymm1 - $ymm16 = VSUBPDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VSUBPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - $ymm16 = VSUBPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $ymm16 = VSUBPSZ256rr $ymm16, $ymm1 - $ymm16 = VSUBPSZ256rr $ymm16, $ymm1 + ; CHECK: $ymm16 = VSUBPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VSUBPDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VSUBPDZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VSUBPDZ256rr $ymm16, $ymm1, implicit $mxcsr + ; CHECK: $ymm16 = VSUBPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $ymm16 = VSUBPSZ256rm $ymm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VSUBPSZ256rr $ymm16, $ymm1, implicit $mxcsr + $ymm16 = VSUBPSZ256rr $ymm16, $ymm1, implicit $mxcsr ; CHECK: $ymm16 = VPUNPCKHBWZ256rm $ymm16, $rip, 1, $rax, 0, $noreg $ymm16 = VPUNPCKHBWZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VPUNPCKHBWZ256rr $ymm16, $ymm1 @@ -2822,150 +2822,150 @@ body: | $ymm16 = VPUNPCKLWDZ256rm $ymm16, $rip, 1, $rax, 0, $noreg ; CHECK: $ymm16 = VPUNPCKLWDZ256rr $ymm16, $ymm1 $ymm16 = VPUNPCKLWDZ256rr $ymm16, $ymm1 - ; CHECK: $ymm16 = VFMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADD132PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADD132PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADD132PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADD132PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, 
$noreg - $ymm16 = VFMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADD213PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADD213PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADD213PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADD213PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADD231PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADD231PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADD231PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADD231PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADDSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADDSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADDSUB132PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADDSUB132PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADDSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADDSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADDSUB132PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADDSUB132PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADDSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADDSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADDSUB213PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADDSUB213PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADDSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADDSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADDSUB213PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADDSUB213PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADDSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADDSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADDSUB231PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADDSUB231PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMADDSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMADDSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMADDSUB231PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMADDSUB231PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUB132PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUB132PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUB132PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUB132PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUB213PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUB213PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUB213PSZ256r $ymm16, 
$ymm1, $ymm2 - $ymm16 = VFMSUB213PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUB231PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUB231PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUB231PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUB231PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUBADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUBADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUBADD132PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUBADD132PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUBADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUBADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUBADD132PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUBADD132PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUBADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUBADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUBADD213PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUBADD213PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUBADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUBADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUBADD213PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUBADD213PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUBADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUBADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUBADD231PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUBADD231PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFMSUBADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFMSUBADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFMSUBADD231PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFMSUBADD231PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMADD132PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMADD132PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMADD132PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMADD132PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMADD213PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMADD213PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMADD213PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMADD213PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMADD231PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMADD231PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMADD231PSZ256m $ymm16, 
$ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMADD231PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMADD231PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMSUB132PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMSUB132PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMSUB132PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMSUB132PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMSUB213PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMSUB213PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMSUB213PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMSUB213PSZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMSUB231PDZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMSUB231PDZ256r $ymm16, $ymm1, $ymm2 - ; CHECK: $ymm16 = VFNMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - $ymm16 = VFNMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VFNMSUB231PSZ256r $ymm16, $ymm1, $ymm2 - $ymm16 = VFNMSUB231PSZ256r $ymm16, $ymm1, $ymm2 + ; CHECK: $ymm16 = VFMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADD132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADD132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADD213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADD213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADD231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADD231PSZ256m $ymm16, 
$ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADD231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADD231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADDSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADDSUB132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADDSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADDSUB132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADDSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADDSUB213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADDSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADDSUB213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADDSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADDSUB231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMADDSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMADDSUB231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMADDSUB231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUB132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUB132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUB213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUB213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = 
VFMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUB231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUB231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUB231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUBADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUBADD132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUBADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUBADD132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUBADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUBADD213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUBADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUBADD213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUBADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUBADD231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFMSUBADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFMSUBADD231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFMSUBADD231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMADD132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMADD132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMADD132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMADD132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMADD213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD213PDZ256r $ymm16, 
$ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMADD213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMADD213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMADD213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMADD231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMADD231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMADD231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMADD231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMADD231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMSUB132PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMSUB132PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMSUB132PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMSUB132PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMSUB213PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMSUB213PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMSUB213PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMSUB213PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMSUB231PDZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMSUB231PDZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VFNMSUB231PSZ256m $ymm16, $ymm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VFNMSUB231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr + $ymm16 = VFNMSUB231PSZ256r $ymm16, $ymm1, $ymm2, implicit $mxcsr ; CHECK: $ymm16 = VPSRADZ256ri $ymm16, 7 $ymm16 = VPSRADZ256ri $ymm16, 7 ; CHECK: $ymm16 = VPSRADZ256rm $ymm16, $rip, 1, $rax, 0, $noreg @@ -3173,51 +3173,51 @@ body: | ; CHECK: $ymm16 = VCVTDQ2PDZ256rm $rdi, 1, $noreg, 0, $noreg $ymm16 = VCVTDQ2PDZ256rm $rdi, 1, $noreg, 0, $noreg ; CHECK: $ymm16 = VCVTDQ2PDZ256rr $xmm0 - $ymm16 = VCVTDQ2PDZ256rr $xmm0 - ; CHECK: $ymm16 = VCVTDQ2PSZ256rm $rdi, 1, $noreg, 0, $noreg - $ymm16 = VCVTDQ2PSZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: 
$ymm16 = VCVTDQ2PSZ256rr $ymm16 - $ymm16 = VCVTDQ2PSZ256rr $ymm16 - ; CHECK: $xmm16 = VCVTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg - $xmm16 = VCVTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VCVTPD2DQZ256rr $ymm16 - $xmm16 = VCVTPD2DQZ256rr $ymm16 - ; CHECK: $xmm16 = VCVTPD2PSZ256rm $rdi, 1, $noreg, 0, $noreg - $xmm16 = VCVTPD2PSZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VCVTPD2PSZ256rr $ymm16 - $xmm16 = VCVTPD2PSZ256rr $ymm16 - ; CHECK: $ymm16 = VCVTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg - $ymm16 = VCVTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VCVTPS2DQZ256rr $ymm16 - $ymm16 = VCVTPS2DQZ256rr $ymm16 - ; CHECK: $ymm16 = VCVTPS2PDZ256rm $rdi, 1, $noreg, 0, $noreg - $ymm16 = VCVTPS2PDZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VCVTPS2PDZ256rr $xmm0 - $ymm16 = VCVTPS2PDZ256rr $xmm0 - ; CHECK: VCVTPS2PHZ256mr $rdi, 1, $noreg, 0, $noreg, $ymm16, 0 - VCVTPS2PHZ256mr $rdi, 1, $noreg, 0, $noreg, $ymm16, 0 - ; CHECK: $xmm0 = VCVTPS2PHZ256rr $ymm16, 0 - $xmm0 = VCVTPS2PHZ256rr $ymm16, 0 - ; CHECK: $ymm16 = VCVTPH2PSZ256rm $rdi, 1, $noreg, 0, $noreg - $ymm16 = VCVTPH2PSZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VCVTPH2PSZ256rr $xmm16 - $ymm16 = VCVTPH2PSZ256rr $xmm16 - ; CHECK: $xmm16 = VCVTTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg - $xmm16 = VCVTTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VCVTTPD2DQZ256rr $ymm16 - $xmm16 = VCVTTPD2DQZ256rr $ymm16 - ; CHECK: $ymm16 = VCVTTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg - $ymm16 = VCVTTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VCVTTPS2DQZ256rr $ymm16 - $ymm16 = VCVTTPS2DQZ256rr $ymm16 - ; CHECK: $ymm16 = VSQRTPDZ256m $rdi, 1, $noreg, 0, $noreg - $ymm16 = VSQRTPDZ256m $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VSQRTPDZ256r $ymm16 - $ymm16 = VSQRTPDZ256r $ymm16 - ; CHECK: $ymm16 = VSQRTPSZ256m $rdi, 1, $noreg, 0, $noreg - $ymm16 = VSQRTPSZ256m $rdi, 1, $noreg, 0, $noreg - ; CHECK: $ymm16 = VSQRTPSZ256r $ymm16 - $ymm16 = VSQRTPSZ256r $ymm16 + $ymm16 = VCVTDQ2PDZ256rr $xmm0, implicit $mxcsr + ; CHECK: $ymm16 = VCVTDQ2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VCVTDQ2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VCVTDQ2PSZ256rr $ymm16, implicit $mxcsr + $ymm16 = VCVTDQ2PSZ256rr $ymm16, implicit $mxcsr + ; CHECK: $xmm16 = VCVTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTPD2DQZ256rr $ymm16, implicit $mxcsr + $xmm16 = VCVTPD2DQZ256rr $ymm16, implicit $mxcsr + ; CHECK: $xmm16 = VCVTPD2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTPD2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTPD2PSZ256rr $ymm16, implicit $mxcsr + $xmm16 = VCVTPD2PSZ256rr $ymm16, implicit $mxcsr + ; CHECK: $ymm16 = VCVTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VCVTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VCVTPS2DQZ256rr $ymm16, implicit $mxcsr + $ymm16 = VCVTPS2DQZ256rr $ymm16, implicit $mxcsr + ; CHECK: $ymm16 = VCVTPS2PDZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VCVTPS2PDZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VCVTPS2PDZ256rr $xmm0, implicit $mxcsr + $ymm16 = VCVTPS2PDZ256rr $xmm0, implicit $mxcsr + ; CHECK: VCVTPS2PHZ256mr $rdi, 1, $noreg, 0, $noreg, $ymm16, 0, implicit $mxcsr + VCVTPS2PHZ256mr $rdi, 1, $noreg, 0, $noreg, $ymm16, 0, implicit $mxcsr + ; CHECK: $xmm0 = VCVTPS2PHZ256rr $ymm16, 0, 
implicit $mxcsr + $xmm0 = VCVTPS2PHZ256rr $ymm16, 0, implicit $mxcsr + ; CHECK: $ymm16 = VCVTPH2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VCVTPH2PSZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VCVTPH2PSZ256rr $xmm16, implicit $mxcsr + $ymm16 = VCVTPH2PSZ256rr $xmm16, implicit $mxcsr + ; CHECK: $xmm16 = VCVTTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTTPD2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTTPD2DQZ256rr $ymm16, implicit $mxcsr + $xmm16 = VCVTTPD2DQZ256rr $ymm16, implicit $mxcsr + ; CHECK: $ymm16 = VCVTTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VCVTTPS2DQZ256rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VCVTTPS2DQZ256rr $ymm16, implicit $mxcsr + $ymm16 = VCVTTPS2DQZ256rr $ymm16, implicit $mxcsr + ; CHECK: $ymm16 = VSQRTPDZ256m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VSQRTPDZ256m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VSQRTPDZ256r $ymm16, implicit $mxcsr + $ymm16 = VSQRTPDZ256r $ymm16, implicit $mxcsr + ; CHECK: $ymm16 = VSQRTPSZ256m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $ymm16 = VSQRTPSZ256m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $ymm16 = VSQRTPSZ256r $ymm16, implicit $mxcsr + $ymm16 = VSQRTPSZ256r $ymm16, implicit $mxcsr ; CHECK: $ymm16 = VPALIGNRZ256rmi $ymm16, $rdi, 1, $noreg, 0, $noreg, 1 $ymm16 = VPALIGNRZ256rmi $ymm16, $rdi, 1, $noreg, 0, $noreg, 1 ; CHECK: $ymm16 = VPALIGNRZ256rri $ymm16, $ymm1, 1 @@ -3252,22 +3252,22 @@ body: | $ymm16 = VSHUFPSZ256rmi $ymm16, $rip, 1, $rax, 0, $noreg, -24 ; CHECK: $ymm16 = VSHUFPSZ256rri $ymm16, $ymm1, -24 $ymm16 = VSHUFPSZ256rri $ymm16, $ymm1, -24 - ; CHECK: $ymm16 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 15 - $ymm16 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $ymm16 = VRNDSCALEPDZ256rri $ymm16, 15 - $ymm16 = VRNDSCALEPDZ256rri $ymm16, 15 - ; CHECK: $ymm16 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 15 - $ymm16 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $ymm16 = VRNDSCALEPSZ256rri $ymm16, 15 - $ymm16 = VRNDSCALEPSZ256rri $ymm16, 15 - ; CHECK: $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 31 - $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 31 - ; CHECK: $ymm0 = VRNDSCALEPDZ256rri $ymm0, 31 - $ymm0 = VRNDSCALEPDZ256rri $ymm0, 31 - ; CHECK: $ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 31 - $ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 31 - ; CHECK: $ymm0 = VRNDSCALEPSZ256rri $ymm0, 31 - $ymm0 = VRNDSCALEPSZ256rri $ymm0, 31 + ; CHECK: $ymm16 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $ymm16 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $ymm16 = VRNDSCALEPDZ256rri $ymm16, 15, implicit $mxcsr + $ymm16 = VRNDSCALEPDZ256rri $ymm16, 15, implicit $mxcsr + ; CHECK: $ymm16 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $ymm16 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $ymm16 = VRNDSCALEPSZ256rri $ymm16, 15, implicit $mxcsr + $ymm16 = VRNDSCALEPSZ256rri $ymm16, 15, implicit $mxcsr + ; CHECK: $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + $ymm0 = VRNDSCALEPDZ256rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + ; CHECK: $ymm0 = VRNDSCALEPDZ256rri $ymm0, 31, implicit $mxcsr + $ymm0 = VRNDSCALEPDZ256rri $ymm0, 31, implicit $mxcsr + ; CHECK: $ymm0 = VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + $ymm0 = 
VRNDSCALEPSZ256rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + ; CHECK: $ymm0 = VRNDSCALEPSZ256rri $ymm0, 31, implicit $mxcsr + $ymm0 = VRNDSCALEPSZ256rri $ymm0, 31, implicit $mxcsr ; CHECK: $ymm16 = VSHUFF32X4Z256rmi $ymm16, $rip, 1, $rax, 0, $noreg, 228 $ymm16 = VSHUFF32X4Z256rmi $ymm16, $rip, 1, $rax, 0, $noreg, 228 ; CHECK: $ymm16 = VSHUFF32X4Z256rri $ymm16, $ymm1, 228 @@ -3446,46 +3446,46 @@ body: | VMOVLPSZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm16 ; CHECK: $xmm16 = VMOVLPSZ128rm $xmm16, $rdi, 1, $noreg, 0, $noreg $xmm16 = VMOVLPSZ128rm $xmm16, $rdi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VMAXCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMAXCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMAXCPDZ128rr $xmm16, $xmm1 - $xmm16 = VMAXCPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMAXCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMAXCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMAXCPSZ128rr $xmm16, $xmm1 - $xmm16 = VMAXCPSZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMAXPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMAXPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMAXPDZ128rr $xmm16, $xmm1 - $xmm16 = VMAXPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMAXPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMAXPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMAXPSZ128rr $xmm16, $xmm1 - $xmm16 = VMAXPSZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMINCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMINCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMINCPDZ128rr $xmm16, $xmm1 - $xmm16 = VMINCPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMINCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMINCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMINCPSZ128rr $xmm16, $xmm1 - $xmm16 = VMINCPSZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMINPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMINPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMINPDZ128rr $xmm16, $xmm1 - $xmm16 = VMINPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMINPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMINPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMINPSZ128rr $xmm16, $xmm1 - $xmm16 = VMINPSZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMULPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMULPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMULPDZ128rr $xmm16, $xmm1 - $xmm16 = VMULPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VMULPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VMULPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VMULPSZ128rr $xmm16, $xmm1 - $xmm16 = VMULPSZ128rr $xmm16, $xmm1 + ; CHECK: $xmm16 = VMAXCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMAXCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMAXCPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMAXCPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMAXCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMAXCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMAXCPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMAXCPSZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMAXPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMAXPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMAXPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMAXPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMAXPSZ128rm 
$xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMAXPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMAXPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMAXPSZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMINCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMINCPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMINCPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMINCPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMINCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMINCPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMINCPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMINCPSZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMINPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMINPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMINPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMINPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMINPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMINPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMINPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMINPSZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMULPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMULPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMULPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMULPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VMULPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VMULPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VMULPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VMULPSZ128rr $xmm16, $xmm1, implicit $mxcsr ; CHECK: $xmm16 = VORPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg $xmm16 = VORPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VORPDZ128rr $xmm16, $xmm1 @@ -3666,14 +3666,14 @@ body: | $xmm16 = VPSUBWZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VPSUBWZ128rr $xmm16, $xmm1 $xmm16 = VPSUBWZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VADDPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VADDPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VADDPDZ128rr $xmm16, $xmm1 - $xmm16 = VADDPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VADDPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VADDPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VADDPSZ128rr $xmm16, $xmm1 - $xmm16 = VADDPSZ128rr $xmm16, $xmm1 + ; CHECK: $xmm16 = VADDPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VADDPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VADDPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VADDPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VADDPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VADDPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VADDPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VADDPSZ128rr $xmm16, $xmm1, implicit $mxcsr ; CHECK: $xmm16 = VANDNPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg $xmm16 = VANDNPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VANDNPDZ128rr $xmm16, $xmm1 @@ -3690,14 +3690,14 @@ body: | $xmm16 = VANDPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VANDPSZ128rr $xmm16, $xmm1 $xmm16 = VANDPSZ128rr $xmm16, $xmm1 - ; CHECK: 
$xmm16 = VDIVPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VDIVPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VDIVPDZ128rr $xmm16, $xmm1 - $xmm16 = VDIVPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VDIVPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VDIVPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VDIVPSZ128rr $xmm16, $xmm1 - $xmm16 = VDIVPSZ128rr $xmm16, $xmm1 + ; CHECK: $xmm16 = VDIVPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VDIVPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VDIVPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VDIVPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VDIVPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VDIVPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VDIVPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VDIVPSZ128rr $xmm16, $xmm1, implicit $mxcsr ; CHECK: $xmm16 = VPXORDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg $xmm16 = VPXORDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VPXORDZ128rr $xmm16, $xmm1 @@ -3706,14 +3706,14 @@ body: | $xmm16 = VPXORQZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VPXORQZ128rr $xmm16, $xmm1 $xmm16 = VPXORQZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VSUBPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VSUBPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VSUBPDZ128rr $xmm16, $xmm1 - $xmm16 = VSUBPDZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VSUBPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - $xmm16 = VSUBPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg - ; CHECK: $xmm16 = VSUBPSZ128rr $xmm16, $xmm1 - $xmm16 = VSUBPSZ128rr $xmm16, $xmm1 + ; CHECK: $xmm16 = VSUBPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VSUBPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VSUBPDZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VSUBPDZ128rr $xmm16, $xmm1, implicit $mxcsr + ; CHECK: $xmm16 = VSUBPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + $xmm16 = VSUBPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VSUBPSZ128rr $xmm16, $xmm1, implicit $mxcsr + $xmm16 = VSUBPSZ128rr $xmm16, $xmm1, implicit $mxcsr ; CHECK: $xmm16 = VXORPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg $xmm16 = VXORPDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VXORPDZ128rr $xmm16, $xmm1 @@ -3794,150 +3794,150 @@ body: | $xmm16 = VUNPCKLPSZ128rm $xmm16, $rip, 1, $rax, 0, $noreg ; CHECK: $xmm16 = VUNPCKLPSZ128rr $xmm16, $xmm1 $xmm16 = VUNPCKLPSZ128rr $xmm16, $xmm1 - ; CHECK: $xmm16 = VFMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADD132PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADD132PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADD132PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADD132PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADD213PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADD213PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = 
VFMADD213PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADD213PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADD231PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADD231PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADD231PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADD231PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADDSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADDSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADDSUB132PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADDSUB132PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADDSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADDSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADDSUB132PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADDSUB132PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADDSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADDSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADDSUB213PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADDSUB213PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADDSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADDSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADDSUB213PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADDSUB213PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADDSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADDSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADDSUB231PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADDSUB231PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMADDSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMADDSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMADDSUB231PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMADDSUB231PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUB132PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUB132PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUB132PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUB132PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUB213PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUB213PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUB213PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUB213PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUB231PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUB231PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUB231PSZ128m 
$xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUB231PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUB231PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUBADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUBADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUBADD132PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUBADD132PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUBADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUBADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUBADD132PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUBADD132PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUBADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUBADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUBADD213PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUBADD213PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUBADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUBADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUBADD213PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUBADD213PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUBADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUBADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUBADD231PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUBADD231PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFMSUBADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFMSUBADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFMSUBADD231PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFMSUBADD231PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMADD132PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMADD132PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMADD132PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMADD132PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMADD213PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMADD213PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMADD213PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMADD213PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMADD231PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMADD231PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMADD231PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMADD231PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, 
$noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMSUB132PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMSUB132PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMSUB132PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMSUB132PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMSUB213PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMSUB213PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMSUB213PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMSUB213PSZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMSUB231PDZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMSUB231PDZ128r $xmm16, $xmm1, $xmm2 - ; CHECK: $xmm16 = VFNMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - $xmm16 = VFNMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg - ; CHECK: $xmm16 = VFNMSUB231PSZ128r $xmm16, $xmm1, $xmm2 - $xmm16 = VFNMSUB231PSZ128r $xmm16, $xmm1, $xmm2 + ; CHECK: $xmm16 = VFMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + $xmm16 = VFMADD132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + $xmm16 = VFMADD132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + $xmm16 = VFMADD213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + $xmm16 = VFMADD213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + $xmm16 = VFMADD231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VFMADD231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + $xmm16 = VFMADD231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr + ; CHECK: $xmm16 = VFMADDSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VFMADDSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, 
$noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADDSUB132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADDSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADDSUB132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADDSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADDSUB213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADDSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADDSUB213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADDSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADDSUB231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADDSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADDSUB231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADDSUB231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUBADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUBADD132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUBADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUBADD132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUBADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUBADD213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUBADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUBADD213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUBADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUBADD231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUBADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUBADD231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUBADD231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB132PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB132PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB132PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB132PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB213PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB213PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB213PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB213PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB231PDZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB231PDZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB231PSZ128m $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB231PSZ128r $xmm16, $xmm1, $xmm2, implicit $mxcsr
; CHECK: $xmm16 = VPSLLDZ128ri $xmm16, 7
$xmm16 = VPSLLDZ128ri $xmm16, 7
; CHECK: $xmm16 = VPSLLDZ128rm $xmm16, $rip, 1, $rax, 0, $noreg
@@ -4024,50 +4024,50 @@ body: |
$xmm16 = VPERMILPSZ128rm $xmm16, $rdi, 1, $noreg, 0, $noreg
; CHECK: $xmm16 = VPERMILPSZ128rr $xmm16, $xmm1
$xmm16 = VPERMILPSZ128rr $xmm16, $xmm1
- ; CHECK: $xmm16 = VCVTPH2PSZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTPH2PSZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTPH2PSZ128rr $xmm16
- $xmm16 = VCVTPH2PSZ128rr $xmm16
+ ; CHECK: $xmm16 = VCVTPH2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTPH2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPH2PSZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTPH2PSZ128rr $xmm16, implicit $mxcsr
; CHECK: $xmm16 = VCVTDQ2PDZ128rm $rdi, 1, $noreg, 0, $noreg
$xmm16 = VCVTDQ2PDZ128rm $rdi, 1, $noreg, 0, $noreg
; CHECK: $xmm16 = VCVTDQ2PDZ128rr $xmm16
$xmm16 = VCVTDQ2PDZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTDQ2PSZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTDQ2PSZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTDQ2PSZ128rr $xmm16
- $xmm16 = VCVTDQ2PSZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTPD2DQZ128rr $xmm16
- $xmm16 = VCVTPD2DQZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTPD2PSZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTPD2PSZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTPD2PSZ128rr $xmm16
- $xmm16 = VCVTPD2PSZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTPS2DQZ128rr $xmm16
- $xmm16 = VCVTPS2DQZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTPS2PDZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTPS2PDZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTPS2PDZ128rr $xmm16
- $xmm16 = VCVTPS2PDZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTTPD2DQZ128rr $xmm16
- $xmm16 = VCVTTPD2DQZ128rr $xmm16
- ; CHECK: $xmm16 = VCVTTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTTPS2DQZ128rr $xmm16
- $xmm16 = VCVTTPS2DQZ128rr $xmm16
- ; CHECK: $xmm16 = VSQRTPDZ128m $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VSQRTPDZ128m $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VSQRTPDZ128r $xmm16
- $xmm16 = VSQRTPDZ128r $xmm16
- ; CHECK: $xmm16 = VSQRTPSZ128m $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VSQRTPSZ128m $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VSQRTPSZ128r $xmm16
- $xmm16 = VSQRTPSZ128r $xmm16
+ ; CHECK: $xmm16 = VCVTDQ2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTDQ2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTDQ2PSZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTDQ2PSZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPD2DQZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTPD2DQZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPD2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTPD2PSZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPD2PSZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTPD2PSZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPS2DQZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTPS2DQZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPS2PDZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTPS2PDZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTPS2PDZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTPS2PDZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTTPD2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTTPD2DQZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTTPD2DQZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTTPS2DQZ128rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTTPS2DQZ128rr $xmm16, implicit $mxcsr
+ $xmm16 = VCVTTPS2DQZ128rr $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTPDZ128m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSQRTPDZ128m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTPDZ128r $xmm16, implicit $mxcsr
+ $xmm16 = VSQRTPDZ128r $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTPSZ128m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSQRTPSZ128m $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTPSZ128r $xmm16, implicit $mxcsr
+ $xmm16 = VSQRTPSZ128r $xmm16, implicit $mxcsr
; CHECK: $xmm16 = VMOVDDUPZ128rm $rdi, 1, $noreg, 0, $noreg
$xmm16 = VMOVDDUPZ128rm $rdi, 1, $noreg, 0, $noreg
; CHECK: $xmm16 = VMOVDDUPZ128rr $xmm16
$xmm16 = VMOVDDUPZ128rr $xmm16
@@ -4134,10 +4134,10 @@ body: |
$xmm16 = VBROADCASTI32X2Z128m $rip, 1, $rax, 0, $noreg
; CHECK: $xmm16 = VBROADCASTI32X2Z128r $xmm0
$xmm16 = VBROADCASTI32X2Z128r $xmm0
- ; CHECK: $xmm16 = VCVTPS2PHZ128rr $xmm16, 2
- $xmm16 = VCVTPS2PHZ128rr $xmm16, 2
- ; CHECK: VCVTPS2PHZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm16, 2
- VCVTPS2PHZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm16, 2
+ ; CHECK: $xmm16 = VCVTPS2PHZ128rr $xmm16, 2, implicit $mxcsr
+ $xmm16 = VCVTPS2PHZ128rr $xmm16, 2, implicit $mxcsr
+ ; CHECK: VCVTPS2PHZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm16, 2, implicit $mxcsr
+ VCVTPS2PHZ128mr $rdi, 1, $noreg, 0, $noreg, $xmm16, 2, implicit $mxcsr
; CHECK: $xmm16 = VPABSBZ128rm $rip, 1, $rax, 0, $noreg
$xmm16 = VPABSBZ128rm $rip, 1, $rax, 0, $noreg
; CHECK: $xmm16 = VPABSBZ128rr $xmm16
@@ -4162,22 +4162,22 @@ body: |
$xmm16 = VINSERTPSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, 1
; CHECK: $xmm16 = VINSERTPSZrr $xmm16, $xmm16, 1
$xmm16 = VINSERTPSZrr $xmm16, $xmm16, 1
- ; CHECK: $xmm16 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 15
- $xmm16 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 15
- ; CHECK: $xmm16 = VRNDSCALEPDZ128rri $xmm16, 15
- $xmm16 = VRNDSCALEPDZ128rri $xmm16, 15
- ; CHECK: $xmm16 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 15
- $xmm16 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 15
- ; CHECK: $xmm16 = VRNDSCALEPSZ128rri $xmm16, 15
- $xmm16 = VRNDSCALEPSZ128rri $xmm16, 15
- ; CHECK: $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 31
- $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 31
- ; CHECK: $xmm0 = VRNDSCALEPDZ128rri $xmm0, 31
- $xmm0 = VRNDSCALEPDZ128rri $xmm0, 31
- ; CHECK: $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 31
- $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 31
- ; CHECK: $xmm0 = VRNDSCALEPSZ128rri $xmm0, 31
- $xmm0 = VRNDSCALEPSZ128rri $xmm0, 31
+ ; CHECK: $xmm16 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr
+ $xmm16 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm16 = VRNDSCALEPDZ128rri $xmm16, 15, implicit $mxcsr
+ $xmm16 = VRNDSCALEPDZ128rri $xmm16, 15, implicit $mxcsr
+ ; CHECK: $xmm16 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr
+ $xmm16 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr
+ ; CHECK: $xmm16 = VRNDSCALEPSZ128rri $xmm16, 15, implicit $mxcsr
+ $xmm16 = VRNDSCALEPSZ128rri $xmm16, 15, implicit $mxcsr
+ ; CHECK: $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr
+ $xmm0 = VRNDSCALEPDZ128rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr
+ ; CHECK: $xmm0 = VRNDSCALEPDZ128rri $xmm0, 31, implicit $mxcsr
+ $xmm0 = VRNDSCALEPDZ128rri $xmm0, 31, implicit $mxcsr
+ ; CHECK: $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr
+ $xmm0 = VRNDSCALEPSZ128rmi $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr
+ ; CHECK: $xmm0 = VRNDSCALEPSZ128rri $xmm0, 31, implicit $mxcsr
+ $xmm0 = VRNDSCALEPSZ128rri $xmm0, 31, implicit $mxcsr
RET 0, $zmm0, $zmm1
...
@@ -4188,310 +4188,310 @@ body: |
name: evex_scalar_to_evex_test
body: |
bb.0:
- ; CHECK: $xmm16 = VADDSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VADDSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VADDSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VADDSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VADDSDZrr $xmm16, $xmm1
- $xmm16 = VADDSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VADDSDZrr_Int $xmm16, $xmm1
- $xmm16 = VADDSDZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VADDSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VADDSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VADDSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VADDSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VADDSSZrr $xmm16, $xmm1
- $xmm16 = VADDSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VADDSSZrr_Int $xmm16, $xmm1
- $xmm16 = VADDSSZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VDIVSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VDIVSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VDIVSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VDIVSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VDIVSDZrr $xmm16, $xmm1
- $xmm16 = VDIVSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VDIVSDZrr_Int $xmm16, $xmm1
- $xmm16 = VDIVSDZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VDIVSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VDIVSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VDIVSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VDIVSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VDIVSSZrr $xmm16, $xmm1
- $xmm16 = VDIVSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VDIVSSZrr_Int $xmm16, $xmm1
- $xmm16 = VDIVSSZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VMAXCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMAXCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMAXCSDZrr $xmm16, $xmm1
- $xmm16 = VMAXCSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMAXCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMAXCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMAXCSSZrr $xmm16, $xmm1
- $xmm16 = VMAXCSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMAXSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMAXSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMAXSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMAXSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMAXSDZrr $xmm16, $xmm1
- $xmm16 = VMAXSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMAXSDZrr_Int $xmm16, $xmm1
- $xmm16 = VMAXSDZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VMAXSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMAXSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMAXSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMAXSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMAXSSZrr $xmm16, $xmm1
- $xmm16 = VMAXSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMAXSSZrr_Int $xmm16, $xmm1
- $xmm16 = VMAXSSZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VMINCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMINCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMINCSDZrr $xmm16, $xmm1
- $xmm16 = VMINCSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMINCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMINCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMINCSSZrr $xmm16, $xmm1
- $xmm16 = VMINCSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMINSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMINSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMINSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMINSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMINSDZrr $xmm16, $xmm1
- $xmm16 = VMINSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMINSDZrr_Int $xmm16, $xmm1
- $xmm16 = VMINSDZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VMINSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMINSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMINSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMINSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMINSSZrr $xmm16, $xmm1
- $xmm16 = VMINSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMINSSZrr_Int $xmm16, $xmm1
- $xmm16 = VMINSSZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VMULSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMULSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMULSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMULSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMULSDZrr $xmm16, $xmm1
- $xmm16 = VMULSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMULSDZrr_Int $xmm16, $xmm1
- $xmm16 = VMULSDZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VMULSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMULSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMULSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VMULSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VMULSSZrr $xmm16, $xmm1
- $xmm16 = VMULSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VMULSSZrr_Int $xmm16, $xmm1
- $xmm16 = VMULSSZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VSUBSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VSUBSDZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VSUBSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VSUBSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VSUBSDZrr $xmm16, $xmm1
- $xmm16 = VSUBSDZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VSUBSDZrr_Int $xmm16, $xmm1
- $xmm16 = VSUBSDZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VSUBSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VSUBSSZrm $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VSUBSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- $xmm16 = VSUBSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg
- ; CHECK: $xmm16 = VSUBSSZrr $xmm16, $xmm1
- $xmm16 = VSUBSSZrr $xmm16, $xmm1
- ; CHECK: $xmm16 = VSUBSSZrr_Int $xmm16, $xmm1
- $xmm16 = VSUBSSZrr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VFMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD132SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD132SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD132SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD132SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD132SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD132SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD132SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD132SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD213SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD213SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD213SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD213SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD213SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD213SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD213SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD213SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD231SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD231SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD231SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD231SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMADD231SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD231SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMADD231SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMADD231SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB132SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB132SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB132SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB132SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB132SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB132SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB132SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB132SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB213SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB213SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB213SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB213SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB213SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB213SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB213SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB213SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB231SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB231SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB231SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB231SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFMSUB231SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB231SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFMSUB231SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFMSUB231SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD132SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD132SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD132SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD132SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD132SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD132SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD132SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD132SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD213SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD213SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD213SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD213SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD213SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD213SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD213SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD213SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD231SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD231SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD231SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD231SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMADD231SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD231SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMADD231SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMADD231SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB132SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB132SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB132SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB132SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB132SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB132SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB132SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB132SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB213SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB213SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB213SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB213SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB213SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB213SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB213SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB213SSZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB231SDZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB231SDZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB231SDZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB231SDZr_Int $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- $xmm16 = VFNMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VFNMSUB231SSZr $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB231SSZr $xmm16, $xmm1, $xmm2
- ; CHECK: $xmm16 = VFNMSUB231SSZr_Int $xmm16, $xmm1, $xmm2
- $xmm16 = VFNMSUB231SSZr_Int $xmm16, $xmm1, $xmm2
+ ; CHECK: $xmm16 = VADDSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VADDSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VADDSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VADDSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VADDSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VADDSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VADDSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VADDSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VADDSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VADDSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VDIVSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VDIVSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VDIVSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VDIVSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VDIVSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VDIVSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VDIVSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VDIVSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VDIVSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMAXCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXCSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMAXCSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMAXCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXCSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMAXCSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMAXSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMAXSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMAXSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMAXSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMAXSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMAXSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMAXSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMAXSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMAXSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMINCSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINCSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMINCSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMINCSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINCSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMINCSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMINSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMINSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMINSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMINSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMINSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMINSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMINSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMINSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMINSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMULSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMULSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMULSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMULSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMULSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VMULSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMULSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VMULSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VMULSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSUBSDZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSUBSDZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSDZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSUBSDZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSUBSDZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSUBSSZrm $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSUBSSZrm_Int $xmm16, $rip, 1, $rax, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSSZrr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSUBSSZrr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSUBSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSUBSSZrr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMADD231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMADD231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFMSUB231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFMSUB231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMADD231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMADD231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMADD231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB132SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB132SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB132SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB132SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB132SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB132SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB132SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB132SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB213SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB213SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB213SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB213SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB213SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB213SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB213SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB213SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB231SDZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB231SDZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB231SDZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB231SDZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB231SSZm $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VFNMSUB231SSZm_Int $xmm16, $xmm16, $rsi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB231SSZr $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ ; CHECK: $xmm16 = VFNMSUB231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
+ $xmm16 = VFNMSUB231SSZr_Int $xmm16, $xmm1, $xmm2, implicit $mxcsr
; CHECK: VPEXTRBZmr $rdi, 1, $noreg, 0, $noreg, $xmm16, 3
VPEXTRBZmr $rdi, 1, $noreg, 0, $noreg, $xmm16, 3
; CHECK: $eax = VPEXTRBZrr $xmm16, 1
@@ -4526,38 +4526,38 @@ body: |
$xmm16 = VPINSRWZrm $xmm16, $rsi, 1, $noreg, 0, $noreg, 3
; CHECK: $xmm16 = VPINSRWZrr $xmm16, $edi, 5
$xmm16 = VPINSRWZrr $xmm16, $edi, 5
- ; CHECK: $xmm16 = VSQRTSDZm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VSQRTSDZm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VSQRTSDZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VSQRTSDZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VSQRTSDZr $xmm16, $xmm1
- $xmm16 = VSQRTSDZr $xmm16, $xmm1
- ; CHECK: $xmm16 = VSQRTSDZr_Int $xmm16, $xmm1
- $xmm16 = VSQRTSDZr_Int $xmm16, $xmm1
- ; CHECK: $xmm16 = VSQRTSSZm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VSQRTSSZm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VSQRTSSZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VSQRTSSZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VSQRTSSZr $xmm16, $xmm1
- $xmm16 = VSQRTSSZr $xmm16, $xmm1
- ; CHECK: $xmm16 = VSQRTSSZr_Int $xmm16, $xmm1
- $xmm16 = VSQRTSSZr_Int $xmm16, $xmm1
- ; CHECK: $rdi = VCVTSD2SI64rm_Int $rdi, 1, $noreg, 0, $noreg
- $rdi = VCVTSD2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $rdi = VCVTSD2SI64Zrr_Int $xmm16
- $rdi = VCVTSD2SI64Zrr_Int $xmm16
- ; CHECK: $edi = VCVTSD2SIrm_Int $rdi, 1, $noreg, 0, $noreg
- $edi = VCVTSD2SIZrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $edi = VCVTSD2SIZrr_Int $xmm16
- $edi = VCVTSD2SIZrr_Int $xmm16
- ; CHECK: $xmm16 = VCVTSD2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSD2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSD2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSD2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSD2SSZrr $xmm16, $noreg
- $xmm16 = VCVTSD2SSZrr $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSD2SSZrr_Int $xmm16, $noreg
- $xmm16 = VCVTSD2SSZrr_Int $xmm16, $noreg
+ ; CHECK: $xmm16 = VSQRTSDZm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSQRTSDZm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSDZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSQRTSDZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSDZr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSQRTSDZr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSDZr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSQRTSDZr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSSZm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSQRTSSZm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSSZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VSQRTSSZm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSSZr $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSQRTSSZr $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $xmm16 = VSQRTSSZr_Int $xmm16, $xmm1, implicit $mxcsr
+ $xmm16 = VSQRTSSZr_Int $xmm16, $xmm1, implicit $mxcsr
+ ; CHECK: $rdi = VCVTSD2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $rdi = VCVTSD2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $rdi = VCVTSD2SI64Zrr_Int $xmm16, implicit $mxcsr
+ $rdi = VCVTSD2SI64Zrr_Int $xmm16, implicit $mxcsr
+ ; CHECK: $edi = VCVTSD2SIrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $edi = VCVTSD2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $edi = VCVTSD2SIZrr_Int $xmm16, implicit $mxcsr
+ $edi = VCVTSD2SIZrr_Int $xmm16, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTSD2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTSD2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTSD2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ $xmm16 = VCVTSD2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTSD2SSZrr $xmm16, $noreg, implicit $mxcsr
+ $xmm16 = VCVTSD2SSZrr $xmm16, $noreg, implicit $mxcsr
+ ; CHECK: $xmm16 = VCVTSD2SSZrr_Int $xmm16, $noreg, implicit $mxcsr
+ $xmm16 = VCVTSD2SSZrr_Int $xmm16, $noreg, implicit $mxcsr
; CHECK: $xmm16 = VCVTSI2SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
$xmm16 = VCVTSI2SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
; CHECK: $xmm16 = VCVTSI2SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
@@ -4566,78 +4566,78 @@ body: |
$xmm16 = VCVTSI2SDZrr $xmm16, $noreg
; CHECK: $xmm16 = VCVTSI2SDZrr_Int $xmm16, $noreg
$xmm16 = VCVTSI2SDZrr_Int $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSI2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSI2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSI2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSI2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSI2SSZrr $xmm16, $noreg
- $xmm16 = VCVTSI2SSZrr $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSI2SSZrr_Int $xmm16, $noreg
- $xmm16 = VCVTSI2SSZrr_Int $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSI642SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSI642SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSI642SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSI642SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSI642SDZrr $xmm16, $noreg
- $xmm16 = VCVTSI642SDZrr $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSI642SDZrr_Int $xmm16, $noreg
- $xmm16 = VCVTSI642SDZrr_Int $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSI642SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSI642SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSI642SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSI642SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSI642SSZrr $xmm16, $noreg
- $xmm16 = VCVTSI642SSZrr $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSI642SSZrr_Int $xmm16, $noreg
- $xmm16 = VCVTSI642SSZrr_Int $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSS2SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSS2SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSS2SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- $xmm16 = VCVTSS2SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $xmm16 = VCVTSS2SDZrr $xmm16, $noreg
- $xmm16 = VCVTSS2SDZrr $xmm16, $noreg
- ; CHECK: $xmm16 = VCVTSS2SDZrr_Int $xmm16, $noreg
- $xmm16 = VCVTSS2SDZrr_Int $xmm16, $noreg
- ; CHECK: $rdi = VCVTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg
- $rdi = VCVTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $rdi = VCVTSS2SI64Zrr_Int $xmm16
- $rdi = VCVTSS2SI64Zrr_Int $xmm16
- ; CHECK: $edi = VCVTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg
- $edi = VCVTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $edi = VCVTSS2SIZrr_Int $xmm16
- $edi = VCVTSS2SIZrr_Int $xmm16
- ; CHECK: $rdi = VCVTTSD2SI64rm $rdi, 1, $noreg, 0, $noreg
- $rdi = VCVTTSD2SI64Zrm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $rdi = VCVTTSD2SI64rm_Int $rdi, 1, $noreg, 0, $noreg
- $rdi = VCVTTSD2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $rdi = VCVTTSD2SI64Zrr $xmm16
- $rdi = VCVTTSD2SI64Zrr $xmm16
- ; CHECK: $rdi = VCVTTSD2SI64Zrr_Int $xmm16
- $rdi = VCVTTSD2SI64Zrr_Int $xmm16
- ; CHECK: $edi = VCVTTSD2SIrm $rdi, 1, $noreg, 0, $noreg
- $edi = VCVTTSD2SIZrm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $edi = VCVTTSD2SIrm_Int $rdi, 1, $noreg, 0, $noreg
- $edi = VCVTTSD2SIZrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $edi = VCVTTSD2SIZrr $xmm16
- $edi = VCVTTSD2SIZrr $xmm16
- ; CHECK: $edi = VCVTTSD2SIZrr_Int $xmm16
- $edi = VCVTTSD2SIZrr_Int $xmm16
- ; CHECK: $rdi = VCVTTSS2SI64rm $rdi, 1, $noreg, 0, $noreg
- $rdi = VCVTTSS2SI64Zrm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $rdi = VCVTTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg
- $rdi = VCVTTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $rdi = VCVTTSS2SI64Zrr $xmm16
- $rdi = VCVTTSS2SI64Zrr $xmm16
- ; CHECK: $rdi = VCVTTSS2SI64Zrr_Int $xmm16
- $rdi = VCVTTSS2SI64Zrr_Int $xmm16
- ; CHECK: $edi = VCVTTSS2SIrm $rdi, 1, $noreg, 0, $noreg
- $edi = VCVTTSS2SIZrm $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $edi = VCVTTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg
- $edi = VCVTTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg
- ; CHECK: $edi = VCVTTSS2SIZrr $xmm16
- $edi = VCVTTSS2SIZrr $xmm16
- ; CHECK: $edi = VCVTTSS2SIZrr_Int $xmm16
- $edi = VCVTTSS2SIZrr_Int $xmm16
+ ; CHECK: $xmm16 = VCVTSI2SSZrm $xmm16, $rdi, 1,
$noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSI2SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSI2SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI2SSZrr $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSI2SSZrr $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI2SSZrr_Int $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSI2SSZrr_Int $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SDZrr $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SDZrr $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SDZrr_Int $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SDZrr_Int $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SSZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SSZrr $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SSZrr $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSI642SSZrr_Int $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSI642SSZrr_Int $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSS2SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSS2SDZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSS2SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $xmm16 = VCVTSS2SDZrm_Int $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSS2SDZrr $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSS2SDZrr $xmm16, $noreg, implicit $mxcsr + ; CHECK: $xmm16 = VCVTSS2SDZrr_Int $xmm16, $noreg, implicit $mxcsr + $xmm16 = VCVTSS2SDZrr_Int $xmm16, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTSS2SI64Zrr_Int $xmm16, implicit $mxcsr + $rdi = VCVTSS2SI64Zrr_Int $xmm16, implicit $mxcsr + ; CHECK: $edi = VCVTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTSS2SIZrr_Int $xmm16, implicit $mxcsr + $edi = VCVTSS2SIZrr_Int $xmm16, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64Zrr $xmm16, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrr $xmm16, implicit $mxcsr + ; CHECK: $rdi = VCVTTSD2SI64Zrr_Int $xmm16, implicit $mxcsr + $rdi = VCVTTSD2SI64Zrr_Int $xmm16, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSD2SIZrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIrm_Int $rdi, 1, $noreg, 0, $noreg, 
implicit $mxcsr + $edi = VCVTTSD2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIZrr $xmm16, implicit $mxcsr + $edi = VCVTTSD2SIZrr $xmm16, implicit $mxcsr + ; CHECK: $edi = VCVTTSD2SIZrr_Int $xmm16, implicit $mxcsr + $edi = VCVTTSD2SIZrr_Int $xmm16, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64rm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64rm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64Zrr $xmm16, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrr $xmm16, implicit $mxcsr + ; CHECK: $rdi = VCVTTSS2SI64Zrr_Int $xmm16, implicit $mxcsr + $rdi = VCVTTSS2SI64Zrr_Int $xmm16, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSS2SIZrm $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + $edi = VCVTTSS2SIZrm_Int $rdi, 1, $noreg, 0, $noreg, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIZrr $xmm16, implicit $mxcsr + $edi = VCVTTSS2SIZrr $xmm16, implicit $mxcsr + ; CHECK: $edi = VCVTTSS2SIZrr_Int $xmm16, implicit $mxcsr + $edi = VCVTTSS2SIZrr_Int $xmm16, implicit $mxcsr ; CHECK: $xmm16 = VMOV64toSDZrr $rdi $xmm16 = VMOV64toSDZrr $rdi ; CHECK: $xmm16 = VMOVDI2SSZrr $eax @@ -4728,38 +4728,38 @@ body: | VUCOMISSZrm $xmm16, $rdi, 1, $noreg, 0, $noreg, implicit-def $eflags, implicit $mxcsr ; CHECK: VUCOMISSZrr $xmm16, $xmm1, implicit-def $eflags, implicit $mxcsr VUCOMISSZrr $xmm16, $xmm1, implicit-def $eflags, implicit $mxcsr - ; CHECK: $xmm16 = VRNDSCALESDZm $xmm16, $rip, 1, $rax, 0, $noreg, 15 - $xmm16 = VRNDSCALESDZm $xmm16, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm16 = VRNDSCALESDZr $xmm16, $xmm1, 15 - $xmm16 = VRNDSCALESDZr $xmm16, $xmm1, 15 - ; CHECK: $xmm16 = VRNDSCALESSZm $xmm16, $rip, 1, $rax, 0, $noreg, 15 - $xmm16 = VRNDSCALESSZm $xmm16, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm16 = VRNDSCALESSZr $xmm16, $xmm1, 15 - $xmm16 = VRNDSCALESSZr $xmm16, $xmm1, 15 - ; CHECK: $xmm16 = VRNDSCALESDZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15 - $xmm16 = VRNDSCALESDZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm16 = VRNDSCALESDZr_Int $xmm16, $xmm1, 15 - $xmm16 = VRNDSCALESDZr_Int $xmm16, $xmm1, 15 - ; CHECK: $xmm16 = VRNDSCALESSZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15 - $xmm16 = VRNDSCALESSZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15 - ; CHECK: $xmm16 = VRNDSCALESSZr_Int $xmm16, $xmm1, 15 - $xmm16 = VRNDSCALESSZr_Int $xmm16, $xmm1, 15 - ; CHECK: $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $rax, 0, $noreg, 31 - $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $rax, 0, $noreg, 31 - ; CHECK: $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 31 - $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 31 - ; CHECK: $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $rax, 0, $noreg, 31 - $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $rax, 0, $noreg, 31 - ; CHECK: $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 31 - $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 31 - ; CHECK: $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31 - $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31 - ; CHECK: $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 31 - $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 31 - ; CHECK: $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31 - $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31 - ; CHECK: $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 31 - $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 31 + ; CHECK: 
$xmm16 = VRNDSCALESDZm $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm16 = VRNDSCALESDZm $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESDZr $xmm16, $xmm1, 15, implicit $mxcsr + $xmm16 = VRNDSCALESDZr $xmm16, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESSZm $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm16 = VRNDSCALESSZm $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESSZr $xmm16, $xmm1, 15, implicit $mxcsr + $xmm16 = VRNDSCALESSZr $xmm16, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESDZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm16 = VRNDSCALESDZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESDZr_Int $xmm16, $xmm1, 15, implicit $mxcsr + $xmm16 = VRNDSCALESDZr_Int $xmm16, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESSZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + $xmm16 = VRNDSCALESSZm_Int $xmm16, $rip, 1, $rax, 0, $noreg, 15, implicit $mxcsr + ; CHECK: $xmm16 = VRNDSCALESSZr_Int $xmm16, $xmm1, 15, implicit $mxcsr + $xmm16 = VRNDSCALESSZr_Int $xmm16, $xmm1, 15, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + $xmm0 = VRNDSCALESDZm $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 31, implicit $mxcsr + $xmm0 = VRNDSCALESDZr $xmm0, $xmm1, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + $xmm0 = VRNDSCALESSZm $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 31, implicit $mxcsr + $xmm0 = VRNDSCALESSZr $xmm0, $xmm1, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + $xmm0 = VRNDSCALESDZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 31, implicit $mxcsr + $xmm0 = VRNDSCALESDZr_Int $xmm0, $xmm1, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + $xmm0 = VRNDSCALESSZm_Int $xmm0, $rip, 1, $rax, 0, $noreg, 31, implicit $mxcsr + ; CHECK: $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 31, implicit $mxcsr + $xmm0 = VRNDSCALESSZr_Int $xmm0, $xmm1, 31, implicit $mxcsr RET 0, $zmm0, $zmm1 ... diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index 58041c29ab64c..011d235c39f62 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -1084,6 +1084,81 @@ entry: ret i64 %result } +; Verify that fptosi(%x) isn't simplified when the rounding mode is +; unknown. +; Verify that no gross errors happen.
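+; On i686 the i128 result is returned indirectly: the caller passes a hidden
+; pointer (held in %esi in the checks below) and the callee pops it on
+; return, hence the 'retl $4'. 64-bit targets instead lower the conversion
+; to a single __fixdfti libcall.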
+define i128 @f20s128(double %x) nounwind strictfp { +; X87-LABEL: f20s128: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %edi +; X87-NEXT: pushl %esi +; X87-NEXT: subl $36, %esp +; X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: leal {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: calll __fixdfti +; X87-NEXT: subl $4, %esp +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87-NEXT: movl {{[0-9]+}}(%esp), %edi +; X87-NEXT: movl %edi, 8(%esi) +; X87-NEXT: movl %edx, 12(%esi) +; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %esi, %eax +; X87-NEXT: addl $36, %esp +; X87-NEXT: popl %esi +; X87-NEXT: popl %edi +; X87-NEXT: retl $4 +; +; X86-SSE-LABEL: f20s128: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %edi +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: subl $36, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: calll __fixdfti +; X86-SSE-NEXT: subl $4, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: movl %edx, 12(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %esi, %eax +; X86-SSE-NEXT: addl $36, %esp +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: popl %edi +; X86-SSE-NEXT: retl $4 +; +; SSE-LABEL: f20s128: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: callq __fixdfti +; SSE-NEXT: popq %rcx +; SSE-NEXT: retq +; +; AVX-LABEL: f20s128: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: callq __fixdfti +; AVX-NEXT: popq %rcx +; AVX-NEXT: retq +entry: + %result = call i128 @llvm.experimental.constrained.fptosi.i128.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i128 %result +} + ; Verify that fptoui(%x) isn't simplified when the rounding mode is ; unknown. ; Verify that no gross errors happen. @@ -1348,6 +1423,82 @@ entry: ret i64 %result } + +; Verify that fptoui(%x) isn't simplified when the rounding mode is +; unknown. +; Verify that no gross errors happen. 
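+; Same shape as f20s128 above, except that the unsigned conversion lowers
+; to the __fixunsdfti libcall.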
+define i128 @f20u128(double %x) nounwind strictfp { +; X87-LABEL: f20u128: +; X87: # %bb.0: # %entry +; X87-NEXT: pushl %edi +; X87-NEXT: pushl %esi +; X87-NEXT: subl $36, %esp +; X87-NEXT: movl {{[0-9]+}}(%esp), %esi +; X87-NEXT: fldl {{[0-9]+}}(%esp) +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: leal {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl %eax, (%esp) +; X87-NEXT: calll __fixunsdfti +; X87-NEXT: subl $4, %esp +; X87-NEXT: movl {{[0-9]+}}(%esp), %eax +; X87-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X87-NEXT: movl {{[0-9]+}}(%esp), %edx +; X87-NEXT: movl {{[0-9]+}}(%esp), %edi +; X87-NEXT: movl %edi, 8(%esi) +; X87-NEXT: movl %edx, 12(%esi) +; X87-NEXT: movl %eax, (%esi) +; X87-NEXT: movl %ecx, 4(%esi) +; X87-NEXT: movl %esi, %eax +; X87-NEXT: addl $36, %esp +; X87-NEXT: popl %esi +; X87-NEXT: popl %edi +; X87-NEXT: retl $4 +; +; X86-SSE-LABEL: f20u128: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: pushl %edi +; X86-SSE-NEXT: pushl %esi +; X86-SSE-NEXT: subl $36, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl %eax, (%esp) +; X86-SSE-NEXT: calll __fixunsdfti +; X86-SSE-NEXT: subl $4, %esp +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-SSE-NEXT: movl %edi, 8(%esi) +; X86-SSE-NEXT: movl %edx, 12(%esi) +; X86-SSE-NEXT: movl %eax, (%esi) +; X86-SSE-NEXT: movl %ecx, 4(%esi) +; X86-SSE-NEXT: movl %esi, %eax +; X86-SSE-NEXT: addl $36, %esp +; X86-SSE-NEXT: popl %esi +; X86-SSE-NEXT: popl %edi +; X86-SSE-NEXT: retl $4 +; +; SSE-LABEL: f20u128: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: callq __fixunsdfti +; SSE-NEXT: popq %rcx +; SSE-NEXT: retq +; +; AVX-LABEL: f20u128: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: callq __fixunsdfti +; AVX-NEXT: popq %rcx +; AVX-NEXT: retq +entry: + %result = call i128 @llvm.experimental.constrained.fptoui.i128.f64(double %x, + metadata !"fpexcept.strict") #0 + ret i128 %result +} + ; Verify that round(42.1) isn't simplified when the rounding mode is ; unknown. ; Verify that no gross errors happen. 
@@ -1823,10 +1974,12 @@ declare i8 @llvm.experimental.constrained.fptosi.i8.f64(double, metadata) declare i16 @llvm.experimental.constrained.fptosi.i16.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptosi.i32.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptosi.i64.f64(double, metadata) +declare i128 @llvm.experimental.constrained.fptosi.i128.f64(double, metadata) declare i8 @llvm.experimental.constrained.fptoui.i8.f64(double, metadata) declare i16 @llvm.experimental.constrained.fptoui.i16.f64(double, metadata) declare i32 @llvm.experimental.constrained.fptoui.i32.f64(double, metadata) declare i64 @llvm.experimental.constrained.fptoui.i64.f64(double, metadata) +declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata) declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) declare i32 @llvm.experimental.constrained.lrint.i32.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar.ll b/llvm/test/CodeGen/X86/fp-strict-scalar.ll index 724095e8aca39..a61f195735ef9 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar.ll @@ -5,7 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X64 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X86 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -mattr=+avx512vl -O3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX-X64 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 | FileCheck %s --check-prefixes=CHECK,X87 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,X87 declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare float @llvm.experimental.constrained.fadd.f32(float, float, metadata, metadata) @@ -16,7 +16,7 @@ declare float @llvm.experimental.constrained.fmul.f32(float, float, metadata, me declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare float @llvm.experimental.constrained.fdiv.f32(float, float, metadata, metadata) declare double @llvm.experimental.constrained.fpext.f64.f32(float, metadata) -declare float @llvm.experimental.constrained.fptrunc.f64.f32(double, metadata, metadata) +declare float @llvm.experimental.constrained.fptrunc.f32.f64(double, metadata, metadata) declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata) declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) @@ -70,8 +70,8 @@ define double @fadd_f64(double %a, double %b) nounwind strictfp { ret double %ret } -define float @fadd_fsub_f32(float %a, float %b) nounwind strictfp { -; SSE-X86-LABEL: fadd_fsub_f32: +define float @fadd_f32(float %a, float %b) nounwind strictfp { +; SSE-X86-LABEL: fadd_f32: ; SSE-X86: # %bb.0: ; SSE-X86-NEXT: pushl %eax ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -81,12 +81,12 @@ define float @fadd_fsub_f32(float %a, float %b) nounwind strictfp { ; SSE-X86-NEXT: popl %eax ; SSE-X86-NEXT: retl ; -; SSE-X64-LABEL: fadd_fsub_f32: +; SSE-X64-LABEL: fadd_f32: ; SSE-X64: # %bb.0: ; SSE-X64-NEXT: addss %xmm1, %xmm0 ; SSE-X64-NEXT: retq ; -; AVX-X86-LABEL: fadd_fsub_f32: +; AVX-X86-LABEL: fadd_f32: ; AVX-X86: # %bb.0: ; 
AVX-X86-NEXT: pushl %eax ; AVX-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -96,12 +96,12 @@ define float @fadd_fsub_f32(float %a, float %b) nounwind strictfp { ; AVX-X86-NEXT: popl %eax ; AVX-X86-NEXT: retl ; -; AVX-X64-LABEL: fadd_fsub_f32: +; AVX-X64-LABEL: fadd_f32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-X64-NEXT: retq ; -; X87-LABEL: fadd_fsub_f32: +; X87-LABEL: fadd_f32: ; X87: # %bb.0: ; X87-NEXT: flds {{[0-9]+}}(%esp) ; X87-NEXT: fadds {{[0-9]+}}(%esp) @@ -480,7 +480,7 @@ define void @fptrunc_double_to_f32(double* %val, float *%ret) nounwind strictfp ; X87-NEXT: popl %eax ; X87-NEXT: retl %1 = load double, double* %val, align 8 - %res = call float @llvm.experimental.constrained.fptrunc.f64.f32(double %1, + %res = call float @llvm.experimental.constrained.fptrunc.f32.f64(double %1, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 store float %res, float* %ret, align 4 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll new file mode 100644 index 0000000000000..2173ff369a927 --- /dev/null +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -0,0 +1,569 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -disable-strictnode-mutation -mtriple=x86_64-linux-android -mattr=+sse | FileCheck %s --check-prefixes=X64,X64-SSE +; RUN: llc < %s -disable-strictnode-mutation -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck %s --check-prefixes=X64,X64-SSE +; RUN: llc < %s -disable-strictnode-mutation -mtriple=x86_64-linux-android -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -disable-strictnode-mutation -mtriple=x86_64-linux-gnu -mattr=+avx | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -disable-strictnode-mutation -mtriple=x86_64-linux-android -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -disable-strictnode-mutation -mtriple=x86_64-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=X64,X64-AVX +; RUN: llc < %s -disable-strictnode-mutation -mtriple=i686-linux-gnu -mattr=-sse | FileCheck %s --check-prefixes=X86 + +; Check soft floating point conversion function calls. 
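+; fp128 has no native x86 support, so each conversion below is expected to
+; remain a libcall: __extend{s,d,x}ftf2 for fpext, __trunctf{s,d,x}f2 for
+; fptrunc, and the __fixtf*/__fixunstf* family for the fp-to-integer cases.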
+ +@vf32 = common global float 0.000000e+00, align 4 +@vf64 = common global double 0.000000e+00, align 8 +@vf80 = common global x86_fp80 0xK00000000000000000000, align 8 +@vf128 = common global fp128 0xL00000000000000000000000000000000, align 16 + +define void @TestFPExtF32_F128() nounwind strictfp { +; X64-SSE-LABEL: TestFPExtF32_F128: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: callq __extendsftf2 +; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: TestFPExtF32_F128: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: pushq %rax +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: callq __extendsftf2 +; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: popq %rax +; X64-AVX-NEXT: retq +; +; X86-LABEL: TestFPExtF32_F128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $24, %esp +; X86-NEXT: flds vf32 +; X86-NEXT: fstps {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendsftf2 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, vf128+8 +; X86-NEXT: movl %edx, vf128+12 +; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: addl $24, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +entry: + %0 = load float, float* @vf32, align 4 + %conv = call fp128 @llvm.experimental.constrained.fpext.f128.f32(float %0, metadata !"fpexcept.strict") #0 + store fp128 %conv, fp128* @vf128, align 16 + ret void +} + +define void @TestFPExtF64_F128() nounwind strictfp { +; X64-SSE-LABEL: TestFPExtF64_F128: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: callq __extenddftf2 +; X64-SSE-NEXT: movaps %xmm0, {{.*}}(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: TestFPExtF64_F128: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: pushq %rax +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: callq __extenddftf2 +; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: popq %rax +; X64-AVX-NEXT: retq +; +; X86-LABEL: TestFPExtF64_F128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: fldl vf64 +; X86-NEXT: fstpl {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extenddftf2 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, vf128+8 +; X86-NEXT: movl %edx, vf128+12 +; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +entry: + %0 = load double, double* @vf64, align 8 + %conv = call fp128 @llvm.experimental.constrained.fpext.f128.f64(double %0, metadata !"fpexcept.strict") #0 + store fp128 %conv, fp128* @vf128, align 16 + ret void +} + +define void @TestFPExtF80_F128() nounwind strictfp { +; X64-SSE-LABEL: TestFPExtF80_F128: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: subq $24, %rsp +; X64-SSE-NEXT: fldt {{.*}}(%rip) +; X64-SSE-NEXT: fstpt (%rsp) +; X64-SSE-NEXT: callq __extendxftf2 +; X64-SSE-NEXT: movaps %xmm0, 
{{.*}}(%rip) +; X64-SSE-NEXT: addq $24, %rsp +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: TestFPExtF80_F128: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: subq $24, %rsp +; X64-AVX-NEXT: fldt {{.*}}(%rip) +; X64-AVX-NEXT: fstpt (%rsp) +; X64-AVX-NEXT: callq __extendxftf2 +; X64-AVX-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: addq $24, %rsp +; X64-AVX-NEXT: retq +; +; X86-LABEL: TestFPExtF80_F128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $40, %esp +; X86-NEXT: fldt vf80 +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: calll __extendxftf2 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, vf128+8 +; X86-NEXT: movl %edx, vf128+12 +; X86-NEXT: movl %eax, vf128 +; X86-NEXT: movl %ecx, vf128+4 +; X86-NEXT: addl $40, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl +entry: + %0 = load x86_fp80, x86_fp80* @vf80, align 8 + %conv = call fp128 @llvm.experimental.constrained.fpext.f128.f80(x86_fp80 %0, metadata !"fpexcept.strict") #0 + store fp128 %conv, fp128* @vf128, align 16 + ret void +} + +define void @TestFPTruncF128_F32() nounwind strictfp { +; X64-SSE-LABEL: TestFPTruncF128_F32: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __trunctfsf2 +; X64-SSE-NEXT: movss %xmm0, {{.*}}(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: TestFPTruncF128_F32: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: pushq %rax +; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: callq __trunctfsf2 +; X64-AVX-NEXT: vmovss %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: popq %rax +; X64-AVX-NEXT: retq +; +; X86-LABEL: TestFPTruncF128_F32: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl vf128+12 +; X86-NEXT: pushl vf128+8 +; X86-NEXT: pushl vf128+4 +; X86-NEXT: pushl vf128 +; X86-NEXT: calll __trunctfsf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: fstps vf32 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = call float @llvm.experimental.constrained.fptrunc.f32.f128(fp128 %0, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + store float %conv, float* @vf32, align 4 + ret void +} + +define void @TestFPTruncF128_F64() nounwind strictfp { +; X64-SSE-LABEL: TestFPTruncF128_F64: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __trunctfdf2 +; X64-SSE-NEXT: movsd %xmm0, {{.*}}(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: TestFPTruncF128_F64: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: pushq %rax +; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: callq __trunctfdf2 +; X64-AVX-NEXT: vmovsd %xmm0, {{.*}}(%rip) +; X64-AVX-NEXT: popq %rax +; X64-AVX-NEXT: retq +; +; X86-LABEL: TestFPTruncF128_F64: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl vf128+12 +; X86-NEXT: pushl vf128+8 +; X86-NEXT: pushl vf128+4 +; X86-NEXT: pushl vf128 +; X86-NEXT: calll __trunctfdf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: fstpl vf64 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = call double @llvm.experimental.constrained.fptrunc.f64.f128(fp128 %0, metadata !"round.dynamic", metadata !"fpexcept.strict") 
#0 + store double %conv, double* @vf64, align 8 + ret void +} + +define void @TestFPTruncF128_F80() nounwind strictfp { +; X64-SSE-LABEL: TestFPTruncF128_F80: +; X64-SSE: # %bb.0: # %entry +; X64-SSE-NEXT: pushq %rax +; X64-SSE-NEXT: movaps {{.*}}(%rip), %xmm0 +; X64-SSE-NEXT: callq __trunctfxf2 +; X64-SSE-NEXT: fstpt {{.*}}(%rip) +; X64-SSE-NEXT: popq %rax +; X64-SSE-NEXT: retq +; +; X64-AVX-LABEL: TestFPTruncF128_F80: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: pushq %rax +; X64-AVX-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; X64-AVX-NEXT: callq __trunctfxf2 +; X64-AVX-NEXT: fstpt {{.*}}(%rip) +; X64-AVX-NEXT: popq %rax +; X64-AVX-NEXT: retq +; +; X86-LABEL: TestFPTruncF128_F80: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl vf128+12 +; X86-NEXT: pushl vf128+8 +; X86-NEXT: pushl vf128+4 +; X86-NEXT: pushl vf128 +; X86-NEXT: calll __trunctfxf2 +; X86-NEXT: addl $16, %esp +; X86-NEXT: fstpt vf80 +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %0 = load fp128, fp128* @vf128, align 16 + %conv = call x86_fp80 @llvm.experimental.constrained.fptrunc.f80.f128(fp128 %0, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + store x86_fp80 %conv, x86_fp80* @vf80, align 8 + ret void +} + +define i8 @fptosi_i8(fp128 %x) nounwind strictfp { +; X64-LABEL: fptosi_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixtfsi +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptosi_i8: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixtfsi +; X86-NEXT: addl $16, %esp +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %conv = call i8 @llvm.experimental.constrained.fptosi.i8.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i8 %conv +} + +define i16 @fptosi_i16(fp128 %x) nounwind strictfp { +; X64-LABEL: fptosi_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixtfsi +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptosi_i16: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixtfsi +; X86-NEXT: addl $16, %esp +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %conv = call i16 @llvm.experimental.constrained.fptosi.i16.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i16 %conv +} + +define i32 @fptosi_i32(fp128 %x) nounwind strictfp { +; X64-LABEL: fptosi_i32: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixtfsi +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptosi_i32: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixtfsi +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +entry: + %conv = call i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i32 %conv +} + +define i64 @fptosi_i64(fp128 %x) nounwind strictfp { +; X64-LABEL: fptosi_i64: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: 
callq __fixtfdi +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptosi_i64: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixtfdi +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +entry: + %conv = call i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i64 %conv +} + +define i128 @fptosi_i128(fp128 %x) nounwind strictfp { +; X64-LABEL: fptosi_i128: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixtfti +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptosi_i128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll __fixtfti +; X86-NEXT: addl $28, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 +entry: + %conv = call i128 @llvm.experimental.constrained.fptosi.i128.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i128 %conv +} + +define i8 @fptoui_i8(fp128 %x) nounwind strictfp { +; X64-LABEL: fptoui_i8: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixtfsi +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptoui_i8: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixunstfsi +; X86-NEXT: addl $16, %esp +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %conv = call i8 @llvm.experimental.constrained.fptoui.i8.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i8 %conv +} + +define i16 @fptoui_i16(fp128 %x) nounwind strictfp { +; X64-LABEL: fptoui_i16: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixtfsi +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptoui_i16: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixunstfsi +; X86-NEXT: addl $16, %esp +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: addl $12, %esp +; X86-NEXT: retl +entry: + %conv = call i16 @llvm.experimental.constrained.fptoui.i16.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i16 %conv +} + +define i32 @fptoui_i32(fp128 %x) nounwind strictfp { +; X64-LABEL: fptoui_i32: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixunstfsi +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptoui_i32: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: 
pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixunstfsi +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +entry: + %conv = call i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i32 %conv +} + +define i64 @fptoui_i64(fp128 %x) nounwind strictfp { +; X64-LABEL: fptoui_i64: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixunstfdi +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptoui_i64: +; X86: # %bb.0: # %entry +; X86-NEXT: subl $12, %esp +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: calll __fixunstfdi +; X86-NEXT: addl $28, %esp +; X86-NEXT: retl +entry: + %conv = call i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i64 %conv +} + +define i128 @fptoui_i128(fp128 %x) nounwind strictfp { +; X64-LABEL: fptoui_i128: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: callq __fixunstfti +; X64-NEXT: popq %rcx +; X64-NEXT: retq +; +; X86-LABEL: fptoui_i128: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll __fixunstfti +; X86-NEXT: addl $28, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 +entry: + %conv = call i128 @llvm.experimental.constrained.fptoui.i128.f128(fp128 %x, metadata !"fpexcept.strict") #0 + ret i128 %conv +} + +attributes #0 = { strictfp } + +declare float @llvm.experimental.constrained.fptrunc.f32.f128(fp128, metadata, metadata) +declare double @llvm.experimental.constrained.fptrunc.f64.f128(fp128, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.fptrunc.f80.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f32(float, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f64(double, metadata) +declare fp128 @llvm.experimental.constrained.fpext.f128.f80(x86_fp80, metadata) +declare i8 @llvm.experimental.constrained.fptosi.i8.f128(fp128, metadata) +declare i16 @llvm.experimental.constrained.fptosi.i16.f128(fp128, metadata) +declare i32 @llvm.experimental.constrained.fptosi.i32.f128(fp128, metadata) +declare i64 @llvm.experimental.constrained.fptosi.i64.f128(fp128, metadata) +declare i128 @llvm.experimental.constrained.fptosi.i128.f128(fp128, metadata) +declare i8 @llvm.experimental.constrained.fptoui.i8.f128(fp128, metadata) +declare i16 @llvm.experimental.constrained.fptoui.i16.f128(fp128, metadata) +declare i32 @llvm.experimental.constrained.fptoui.i32.f128(fp128, metadata) +declare i64 @llvm.experimental.constrained.fptoui.i64.f128(fp128, metadata) +declare i128 @llvm.experimental.constrained.fptoui.i128.f128(fp128, metadata) diff --git 
a/llvm/test/CodeGen/X86/fp128-i128.ll b/llvm/test/CodeGen/X86/fp128-i128.ll index c47b92f04e49f..d99ce45f050ca 100644 --- a/llvm/test/CodeGen/X86/fp128-i128.ll +++ b/llvm/test/CodeGen/X86/fp128-i128.ll @@ -496,9 +496,8 @@ define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, ; AVX-NEXT: testl %ebp, %ebp ; AVX-NEXT: jle .LBB10_1 ; AVX-NEXT: # %bb.2: # %if.then -; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload -; AVX-NEXT: vmovaps %xmm1, %xmm2 ; AVX-NEXT: jmp .LBB10_3 ; AVX-NEXT: .LBB10_1: ; AVX-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index a37adcb107c39..05b129ceeeaad 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1,10 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-android \ ; RUN: -enable-legalize-types-checking \ ; RUN: -disable-strictnode-mutation | FileCheck %s -; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \ +; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu \ ; RUN: -enable-legalize-types-checking \ ; RUN: -disable-strictnode-mutation | FileCheck %s +; RUN: llc < %s -O2 -mtriple=i686-linux-gnu -mattr=+sse2 \ +; RUN: -enable-legalize-types-checking \ +; RUN: -disable-strictnode-mutation | FileCheck %s --check-prefix=X86 ; Check all soft floating point library function calls. @@ -15,6 +18,39 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-NEXT: callq __addtf3 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: add: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll __addtf3 +; X86-NEXT: addl $44, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 entry: %add = call fp128 @llvm.experimental.constrained.fadd.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret fp128 %add @@ -27,6 +63,39 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-NEXT: callq __subtf3 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: sub: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: 
pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll __subtf3 +; X86-NEXT: addl $44, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 entry: %sub = call fp128 @llvm.experimental.constrained.fsub.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret fp128 %sub @@ -39,6 +108,39 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-NEXT: callq __multf3 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: mul: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll __multf3 +; X86-NEXT: addl $44, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 entry: %mul = call fp128 @llvm.experimental.constrained.fmul.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret fp128 %mul @@ -51,6 +153,39 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-NEXT: callq __divtf3 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: div: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll __divtf3 +; X86-NEXT: addl $44, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 entry: %div = call fp128 @llvm.experimental.constrained.fdiv.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret fp128 %div @@ -63,6 +198,43 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; CHECK-NEXT: callq fmal ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: fma: 
+; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll fmal +; X86-NEXT: addl $60, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 entry: %fma = call fp128 @llvm.experimental.constrained.fma.f128(fp128 %x, fp128 %y, fp128 %z, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret fp128 %fma @@ -75,6 +247,39 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; CHECK-NEXT: callq fmodl ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: frem: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll fmodl +; X86-NEXT: addl $44, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl $4 entry: %div = call fp128 @llvm.experimental.constrained.frem.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 ret fp128 %div @@ -87,6 +292,35 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; CHECK-NEXT: callq ceill ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq +; +; X86-LABEL: ceil: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $20, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll ceill +; X86-NEXT: addl $28, %esp +; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, 8(%esi) +; X86-NEXT: movl %edx, 12(%esi) +; X86-NEXT: movl %eax, (%esi) +; X86-NEXT: movl %ecx, 4(%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $20, %esp +; X86-NEXT: 
popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %ceil = call fp128 @llvm.experimental.constrained.ceil.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %ceil
@@ -99,6 +333,35 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq cosl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: cos:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll cosl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %cos = call fp128 @llvm.experimental.constrained.cos.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %cos
@@ -111,6 +374,35 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq expl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: exp:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll expl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %exp = call fp128 @llvm.experimental.constrained.exp.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %exp
@@ -123,6 +415,35 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq exp2l
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: exp2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll exp2l
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %exp2 = call fp128 @llvm.experimental.constrained.exp2.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %exp2
@@ -135,6 +456,35 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq floorl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: floor:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll floorl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %floor = call fp128 @llvm.experimental.constrained.floor.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %floor
@@ -147,6 +497,35 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq logl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: log:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll logl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %log = call fp128 @llvm.experimental.constrained.log.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %log
@@ -159,6 +538,35 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq log10l
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: log10:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll log10l
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %log10 = call fp128 @llvm.experimental.constrained.log10.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %log10
@@ -171,6 +579,35 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq log2l
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: log2:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll log2l
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %log2 = call fp128 @llvm.experimental.constrained.log2.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %log2
@@ -183,6 +620,39 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; CHECK-NEXT: callq fmaxl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: maxnum:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll fmaxl
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %maxnum = call fp128 @llvm.experimental.constrained.maxnum.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %maxnum
@@ -195,6 +665,39 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; CHECK-NEXT: callq fminl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: minnum:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll fminl
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %minnum = call fp128 @llvm.experimental.constrained.minnum.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %minnum
@@ -207,6 +710,35 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq nearbyintl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: nearbyint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll nearbyintl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %nearbyint = call fp128 @llvm.experimental.constrained.nearbyint.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %nearbyint
@@ -219,6 +751,39 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; CHECK-NEXT: callq powl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: pow:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll powl
+; X86-NEXT: addl $44, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %pow = call fp128 @llvm.experimental.constrained.pow.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %pow
@@ -231,6 +796,36 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; CHECK-NEXT: callq __powitf2
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: powi:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $8, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll __powitf2
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %powi = call fp128 @llvm.experimental.constrained.powi.f128(fp128 %x, i32 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %powi
@@ -243,6 +838,35 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq rintl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: rint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll rintl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %rint = call fp128 @llvm.experimental.constrained.rint.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %rint
@@ -255,6 +879,35 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq roundl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: round:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll roundl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %round = call fp128 @llvm.experimental.constrained.round.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %round
@@ -267,6 +920,35 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq sinl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: sin:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll sinl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %sin = call fp128 @llvm.experimental.constrained.sin.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %sin
@@ -279,6 +961,35 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq sqrtl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: sqrt:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll sqrtl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %sqrt = call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %sqrt
@@ -291,11 +1002,132 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; CHECK-NEXT: callq truncl
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
+;
+; X86-LABEL: trunc:
+; X86: # %bb.0: # %entry
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: subl $20, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %eax
+; X86-NEXT: calll truncl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: movl (%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 8(%esi)
+; X86-NEXT: movl %edx, 12(%esi)
+; X86-NEXT: movl %eax, (%esi)
+; X86-NEXT: movl %ecx, 4(%esi)
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl $20, %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: retl $4
 entry:
 %trunc = call fp128 @llvm.experimental.constrained.trunc.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
 ret fp128 %trunc
 }
 
+define i32 @lrint(fp128 %x) nounwind strictfp {
+; CHECK-LABEL: lrint:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq lrintl
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+;
+; X86-LABEL: lrint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll lrintl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+entry:
+ %rint = call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret i32 %rint
+}
+
+define i64 @llrint(fp128 %x) nounwind strictfp {
+; CHECK-LABEL: llrint:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq llrintl
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+;
+; X86-LABEL: llrint:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll llrintl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+entry:
+ %rint = call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+ ret i64 %rint
+}
+
+define i32 @lround(fp128 %x) nounwind strictfp {
+; CHECK-LABEL: lround:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq lroundl
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+;
+; X86-LABEL: lround:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll lroundl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+entry:
+ %round = call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0
+ ret i32 %round
+}
+
+define i64 @llround(fp128 %x) nounwind strictfp {
+; CHECK-LABEL: llround:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq llroundl
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+;
+; X86-LABEL: llround:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: calll llroundl
+; X86-NEXT: addl $28, %esp
+; X86-NEXT: retl
+entry:
+ %round = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0
+ ret i64 %round
+}
+
 attributes #0 = { strictfp }
 
 declare fp128 @llvm.experimental.constrained.fadd.f128(fp128, fp128, metadata, metadata)
@@ -322,3 +1154,7 @@ declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata, metadat
 declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lrint.i32.f128(fp128, metadata, metadata)
+declare i64 @llvm.experimental.constrained.llrint.i64.f128(fp128, metadata, metadata)
+declare i32 @llvm.experimental.constrained.lround.i32.f128(fp128, metadata)
+declare i64 @llvm.experimental.constrained.llround.i64.f128(fp128, metadata)
diff --git a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
index 1fc5d0196190d..e4fcf54e6950a 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-scalar.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=-sse -O3 | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-sse -O3 | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O3 -disable-strictnode-mutation | FileCheck %s --check-prefixes=CHECK,X64
 
 declare x86_fp80 @llvm.experimental.constrained.fadd.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
 declare x86_fp80 @llvm.experimental.constrained.fsub.x86_fp80(x86_fp80, x86_fp80, metadata, metadata)
@@ -9,8 +9,8 @@ declare x86_fp80 @llvm.experimental.constrained.fdiv.x86_fp80(x86_fp80, x86_fp80
 declare x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f32(float, metadata)
 declare x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double, metadata)
 declare x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80, metadata, metadata)
-declare float @llvm.experimental.constrained.fptrunc.x86_fp80.f32(x86_fp80, metadata, metadata)
-declare double @llvm.experimental.constrained.fptrunc.x86_fp80.f64(x86_fp80, metadata, metadata)
+declare float @llvm.experimental.constrained.fptrunc.f32.x86_fp80(x86_fp80, metadata, metadata)
+declare double @llvm.experimental.constrained.fptrunc.f64.x86_fp80(x86_fp80, metadata, metadata)
 
 define x86_fp80 @fadd_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ; X86-LABEL: fadd_fp80:
@@ -92,129 +92,102 @@ define x86_fp80 @fdiv_fp80(x86_fp80 %a, x86_fp80 %b) nounwind strictfp {
 ret x86_fp80 %ret
 }
 
-define void @fpext_f32_to_fp80(float* %val, x86_fp80* %ret) nounwind strictfp {
+define x86_fp80 @fpext_f32_to_fp80(float %a) nounwind strictfp {
 ; X86-LABEL: fpext_f32_to_fp80:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: flds (%ecx)
-; X86-NEXT: fstpt (%eax)
+; X86-NEXT: flds {{[0-9]+}}(%esp)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: fpext_f32_to_fp80:
 ; X64: # %bb.0:
-; X64-NEXT: flds (%rdi)
-; X64-NEXT: fstpt (%rsi)
+; X64-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: flds -{{[0-9]+}}(%rsp)
 ; X64-NEXT: retq
- %1 = load float, float* %val, align 4
- %res = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f32(float %1,
+ %ret = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f32(float %a,
 metadata !"fpexcept.strict") #0
- store x86_fp80 %res, x86_fp80* %ret, align 16
- ret void
+ ret x86_fp80 %ret
 }
 
-define void @fpext_f64_to_fp80(double* %val, x86_fp80* %ret) nounwind strictfp {
+define x86_fp80 @fpext_f64_to_fp80(double %a) nounwind strictfp {
 ; X86-LABEL: fpext_f64_to_fp80:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: fldl (%ecx)
-; X86-NEXT: fstpt (%eax)
+; X86-NEXT: fldl {{[0-9]+}}(%esp)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: fpext_f64_to_fp80:
 ; X64: # %bb.0:
-; X64-NEXT: fldl (%rdi)
-; X64-NEXT: fstpt (%rsi)
+; X64-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT: fldl -{{[0-9]+}}(%rsp)
 ; X64-NEXT: retq
- %1 = load double, double* %val, align 8
- %res = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double %1,
+ %ret = call x86_fp80 @llvm.experimental.constrained.fpext.x86_fp80.f64(double %a,
 metadata !"fpexcept.strict") #0
- store x86_fp80 %res, x86_fp80* %ret, align 16
- ret void
+ ret x86_fp80 %ret
 }
 
-define void @fptrunc_fp80_to_f32(x86_fp80* %val, float *%ret) nounwind strictfp {
+define float @fptrunc_fp80_to_f32(x86_fp80 %a) nounwind strictfp {
 ; X86-LABEL: fptrunc_fp80_to_f32:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: fldt (%ecx)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
 ; X86-NEXT: fstps (%esp)
 ; X86-NEXT: flds (%esp)
-; X86-NEXT: fstps (%eax)
 ; X86-NEXT: popl %eax
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: fptrunc_fp80_to_f32:
 ; X64: # %bb.0:
-; X64-NEXT: fldt (%rdi)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT: fstps -{{[0-9]+}}(%rsp)
-; X64-NEXT: flds -{{[0-9]+}}(%rsp)
-; X64-NEXT: fstps (%rsi)
+; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT: retq
- %1 = load x86_fp80, x86_fp80* %val, align 16
- %res = call float @llvm.experimental.constrained.fptrunc.x86_fp80.f32(x86_fp80 %1,
+ %ret = call float @llvm.experimental.constrained.fptrunc.f32.x86_fp80(x86_fp80 %a,
 metadata !"round.dynamic", metadata !"fpexcept.strict") #0
- store float %res, float* %ret, align 4
- ret void
+ ret float %ret
 }
 
-define void @fptrunc_fp80_to_f64(x86_fp80* %val, double* %ret) nounwind strictfp {
+define double @fptrunc_fp80_to_f64(x86_fp80 %a) nounwind strictfp {
 ; X86-LABEL: fptrunc_fp80_to_f64:
 ; X86: # %bb.0:
 ; X86-NEXT: pushl %ebp
 ; X86-NEXT: movl %esp, %ebp
 ; X86-NEXT: andl $-8, %esp
 ; X86-NEXT: subl $8, %esp
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: fldt (%ecx)
+; X86-NEXT: fldt 8(%ebp)
 ; X86-NEXT: fstpl (%esp)
 ; X86-NEXT: fldl (%esp)
-; X86-NEXT: fstpl (%eax)
 ; X86-NEXT: movl %ebp, %esp
 ; X86-NEXT: popl %ebp
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: fptrunc_fp80_to_f64:
 ; X64: # %bb.0:
-; X64-NEXT: fldt (%rdi)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT: fstpl -{{[0-9]+}}(%rsp)
-; X64-NEXT: fldl -{{[0-9]+}}(%rsp)
-; X64-NEXT: fstpl (%rsi)
+; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT: retq
- %1 = load x86_fp80, x86_fp80* %val, align 16
- %res = call double @llvm.experimental.constrained.fptrunc.x86_fp80.f64(x86_fp80 %1,
+ %ret = call double @llvm.experimental.constrained.fptrunc.f64.x86_fp80(x86_fp80 %a,
 metadata !"round.dynamic", metadata !"fpexcept.strict") #0
- store double %res, double* %ret, align 8
- ret void
+ ret double %ret
 }
 
-define void @fsqrt_fp80(x86_fp80* %a) nounwind strictfp {
+define x86_fp80 @fsqrt_fp80(x86_fp80 %a) nounwind strictfp {
 ; X86-LABEL: fsqrt_fp80:
 ; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: fldt (%eax)
+; X86-NEXT: fldt {{[0-9]+}}(%esp)
 ; X86-NEXT: fsqrt
-; X86-NEXT: fstpt (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: fsqrt_fp80:
 ; X64: # %bb.0:
-; X64-NEXT: fldt (%rdi)
+; X64-NEXT: fldt {{[0-9]+}}(%rsp)
 ; X64-NEXT: fsqrt
-; X64-NEXT: fstpt (%rdi)
 ; X64-NEXT: retq
- %1 = load x86_fp80, x86_fp80* %a, align 16
- %res = call x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80 %1,
+ %ret = call x86_fp80 @llvm.experimental.constrained.sqrt.x86_fp80(x86_fp80 %a,
 metadata !"round.dynamic", metadata !"fpexcept.strict") #0
- store x86_fp80 %res, x86_fp80* %a, align 16
- ret void
+ ret x86_fp80 %ret
 }
 
 attributes #0 = { strictfp }
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index a655c5804e1bc..009f2420575f9 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -279,8 +279,7 @@ define i64 @var_shift_i64(i64 %x, i64 %y, i64 %z) nounwind {
 ; X86-SLOW-NEXT: orl %edi, %edx
 ; X86-SLOW-NEXT: movl %edx, (%esp) # 4-byte Spill
 ; X86-SLOW-NEXT: .LBB4_2:
-; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx
 ; X86-SLOW-NEXT: movl %ebx, %ecx
 ; X86-SLOW-NEXT: shrl %cl, %edx
 ; X86-SLOW-NEXT: movb %bl, %ah
diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll
index 448c21d93ac8e..92118100bba84 100644
--- a/llvm/test/CodeGen/X86/ftrunc.ll
+++ b/llvm/test/CodeGen/X86/ftrunc.ll
@@ -289,12 +289,12 @@ define <2 x double> @trunc_signed_v2f64(<2 x double> %x) #0 {
 ; SSE2-LABEL: trunc_signed_v2f64:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: cvttsd2si %xmm0, %rax
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE2-NEXT: cvttsd2si %xmm0, %rcx
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0
 ; SSE2-NEXT: cvtsi2sd %rcx, %xmm1
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_signed_v2f64:
@@ -315,20 +315,20 @@ define <4 x double> @trunc_signed_v4f64(<4 x double> %x) #0 {
 ; SSE2-LABEL: trunc_signed_v4f64:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: cvttsd2si %xmm1, %rax
-; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE2-NEXT: cvttsd2si %xmm1, %rcx
 ; SSE2-NEXT: cvttsd2si %xmm0, %rdx
-; SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE2-NEXT: cvttsd2si %xmm0, %rsi
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: cvtsi2sd %rdx, %xmm0
 ; SSE2-NEXT: xorps %xmm1, %xmm1
 ; SSE2-NEXT: cvtsi2sd %rsi, %xmm1
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE2-NEXT: xorps %xmm1, %xmm1
 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1
 ; SSE2-NEXT: cvtsi2sd %rcx, %xmm2
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: trunc_signed_v4f64:
diff --git a/llvm/test/CodeGen/X86/i128-mul.ll b/llvm/test/CodeGen/X86/i128-mul.ll
index 9f9636361a50a..e40f10a67dd1b 100644
--- a/llvm/test/CodeGen/X86/i128-mul.ll
+++ b/llvm/test/CodeGen/X86/i128-mul.ll
@@ -88,9 +88,8 @@ define i64 @mul1(i64 %n, i64* nocapture %z, i64* nocapture %x, i64 %y) nounwind
 ; X86-NEXT: movl 4(%eax,%ebp,8), %ecx
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: mull %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: mull %edi
 ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT: movl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/implicit-null-checks.mir b/llvm/test/CodeGen/X86/implicit-null-checks.mir
index e6147f56ed777..e1ac01a829730 100644
--- a/llvm/test/CodeGen/X86/implicit-null-checks.mir
+++ b/llvm/test/CodeGen/X86/implicit-null-checks.mir
@@ -828,6 +828,7 @@ name: inc_store_with_dep
 # CHECK-NEXT: $noreg = FAULTING_OP 3, %bb.2, {{[0-9]+}}, $rdi, 1, $noreg, 16, $noreg, $esi
 # CHECK-NEXT: JMP_1 %bb.1
 # CHECK: bb.1.not_null
+# CHECK-NOT: liveins: {{.*}} $eflags
 
 alignment: 16
 tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
index 8d43a1b73234c..980956bdaa88c 100644
--- a/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
+++ b/llvm/test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -321,7 +321,7 @@ define i32 @test_zext_cmp11(double %a, double %b) "no-nans-fp-math"="true" {
 ;
 ; ALL-LABEL: test_zext_cmp11:
 ; ALL: # %bb.0: # %entry
-; ALL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; ALL-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; ALL-NEXT: vucomisd %xmm2, %xmm0
 ; ALL-NEXT: sete %al
 ; ALL-NEXT: vucomisd %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
new file mode 100644
index 0000000000000..27cd7b98fa60c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
@@ -0,0 +1,128 @@
+; RUN: llc < %s | FileCheck --check-prefixes=CHECK %s
+; RUN: llc -O0 < %s | FileCheck --check-prefixes=CHECK %s
+
+; Source to regenerate:
+; struct Foo {
+; int * __ptr32 p32;
+; int * __ptr64 p64;
+; __attribute__((address_space(9))) int *p_other;
+; };
+; void use_foo(Foo *f);
+; void test_sign_ext(Foo *f, int * __ptr32 __sptr i) {
+; f->p64 = i;
+; use_foo(f);
+; }
+; void test_zero_ext(Foo *f, int * __ptr32 __uptr i) {
+; f->p64 = i;
+; use_foo(f);
+; }
+; void test_trunc(Foo *f, int * __ptr64 i) {
+; f->p32 = i;
+; use_foo(f);
+; }
+; void test_noop1(Foo *f, int * __ptr32 i) {
+; f->p32 = i;
+; use_foo(f);
+; }
+; void test_noop2(Foo *f, int * __ptr64 i) {
+; f->p64 = i;
+; use_foo(f);
+; }
+; void test_null_arg(Foo *f, int * __ptr32 i) {
+; test_noop1(f, 0);
+; }
+; void test_unrecognized(Foo *f, __attribute__((address_space(14))) int *i) {
+; f->p32 = (int * __ptr32)i;
+; use_foo(f);
+; }
+;
+; $ clang -cc1 -triple x86_64-windows-msvc -fms-extensions -O2 -S t.cpp
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-windows-msvc"
+
+%struct.Foo = type { i32 addrspace(270)*, i32*, i32 addrspace(9)* }
+declare dso_local void @use_foo(%struct.Foo*)
+
+define dso_local void @test_sign_ext(%struct.Foo* %f, i32 addrspace(270)* %i) {
+; CHECK-LABEL: test_sign_ext
+; CHECK: movslq %edx, %rax
+entry:
+ %0 = addrspacecast i32 addrspace(270)* %i to i32*
+ %p64 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 1
+ store i32* %0, i32** %p64, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
+
+define dso_local void @test_zero_ext(%struct.Foo* %f, i32 addrspace(271)* %i) {
+; CHECK-LABEL: test_zero_ext
+; CHECK: movl %edx, %eax
+entry:
+ %0 = addrspacecast i32 addrspace(271)* %i to i32*
+ %p64 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 1
+ store i32* %0, i32** %p64, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
+
+define dso_local void @test_trunc(%struct.Foo* %f, i32* %i) {
+; CHECK-LABEL: test_trunc
+; CHECK: movl %edx, (%rcx)
+entry:
+ %0 = addrspacecast i32* %i to i32 addrspace(270)*
+ %p32 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 0
+ store i32 addrspace(270)* %0, i32 addrspace(270)** %p32, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
+
+define dso_local void @test_noop1(%struct.Foo* %f, i32 addrspace(270)* %i) {
+; CHECK-LABEL: test_noop1
+; CHECK: movl %edx, (%rcx)
+entry:
+ %p32 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 0
+ store i32 addrspace(270)* %i, i32 addrspace(270)** %p32, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
+
+define dso_local void @test_noop2(%struct.Foo* %f, i32* %i) {
+; CHECK-LABEL: test_noop2
+; CHECK: movq %rdx, 8(%rcx)
+entry:
+ %p64 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 1
+ store i32* %i, i32** %p64, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
+
+; Test that null can be passed as a 32-bit pointer.
+define dso_local void @test_null_arg(%struct.Foo* %f) {
+entry:
+ call void @test_noop1(%struct.Foo* %f, i32 addrspace(270)* null)
+ ret void
+}
+
+; Test casts between unrecognized address spaces.
+define void @test_unrecognized(%struct.Foo* %f, i32 addrspace(14)* %i) {
+; CHECK-LABEL: test_unrecognized
+; CHECK: movl %edx, (%rcx)
+entry:
+ %0 = addrspacecast i32 addrspace(14)* %i to i32 addrspace(270)*
+ %p32 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 0
+ store i32 addrspace(270)* %0, i32 addrspace(270)** %p32, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
+
+define void @test_unrecognized2(%struct.Foo* %f, i32 addrspace(271)* %i) {
+; CHECK-LABEL: test_unrecognized2
+; CHECK: movl %edx, %eax
+entry:
+ %0 = addrspacecast i32 addrspace(271)* %i to i32 addrspace(9)*
+ %p32 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i64 0, i32 2
+ store i32 addrspace(9)* %0, i32 addrspace(9)** %p32, align 8
+ tail call void @use_foo(%struct.Foo* %f)
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/mmx-reg-usage.ll b/llvm/test/CodeGen/X86/mmx-reg-usage.ll
deleted file mode 100644
index a8d88c2e9f8e2..0000000000000
--- a/llvm/test/CodeGen/X86/mmx-reg-usage.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -march=x86-64 -mattr=+mmx -stop-after finalize-isel -o - %s | FileCheck %s
-; This test ensures that the MXCSR is implicitly used by MMX FP instructions.
-
-define x86_mmx @mxcsr_usage(<4 x float> %a0) {
- %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
- %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1)
- %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2)
- %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3)
- %5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
- ret x86_mmx %5
-}
-
-declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
-declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
-declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
-declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
-
-; CHECK: MMX_CVTPS2PIirr %{{[0-9]}}, implicit $mxcsr
-; CHECK: MMX_CVTPI2PSirr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr
-; CHECK: MMX_CVTTPS2PIirr killed %{{[0-9]}}, implicit $mxcsr
-; CHECK: MMX_CVTPI2PDirr killed %{{[0-9]$}}
-; CHECK: MMX_CVTPD2PIirr killed %{{[0-9]}}, implicit $mxcsr
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 40f6b09288e05..a5050467ac1af 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -153,9 +153,8 @@ define void @test_512(i512* %a, i512* %b, i512* %out) nounwind {
 ; X32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: adcl $0, %edx
 ; X32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl %esi, %ecx
-; X32-NEXT: movl 8(%esi), %ebx
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: movl 8(%ecx), %ebx
 ; X32-NEXT: movl %ebx, %eax
 ; X32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
new file mode 100644
index 0000000000000..3bae883a8d9de
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mxcsr-reg-usage.ll
@@ -0,0 +1,52 @@
+; RUN: llc -march=x86-64 -mattr=+mmx,+fma,+f16c,+avx512f -stop-after finalize-isel -o - %s | FileCheck %s
+; This test ensures that the MXCSR is implicitly used by MMX, F16C, FMA and AVX-512 FP instructions.
+
+define x86_mmx @mxcsr_mmx(<4 x float> %a0) {
+; CHECK: MMX_CVTPS2PIirr %{{[0-9]}}, implicit $mxcsr
+; CHECK: MMX_CVTPI2PSirr %{{[0-9]}}, killed %{{[0-9]}}, implicit $mxcsr
+; CHECK: MMX_CVTTPS2PIirr killed %{{[0-9]}}, implicit $mxcsr
+; CHECK: MMX_CVTPI2PDirr killed %{{[0-9]$}}
+; CHECK: MMX_CVTPD2PIirr killed %{{[0-9]}}, implicit $mxcsr
+ %1 = call x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float> %a0)
+ %2 = call <4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float> %a0, x86_mmx %1)
+ %3 = call x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float> %2)
+ %4 = call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %3)
+ %5 = call x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double> %4)
+ ret x86_mmx %5
+}
+
+define half @mxcsr_f16c(float %a) {
+; CHECK: VCVTPS2PH{{.*}}mxcsr
+; CHECK: VCVTPH2PS{{.*}}mxcsr
+ %res = fptrunc float %a to half
+ ret half %res
+}
+
+define <4 x float> @mxcsr_fma_ss(<4 x float> %a, <4 x float> %b) {
+; CHECK: VFMADD{{.*}}mxcsr
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %b, <4 x float> %a, <4 x float>
+%a)
+ ret <4 x float> %res
+}
+
+define <4 x float> @mxcsr_fma_ps(<4 x float> %a, <4 x float> %b) {
+; CHECK: VFMADD{{.*}}mxcsr
+ %res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %b, <4 x float> %a, <4 x float>
+%a)
+ ret <4 x float> %res
+}
+
+define <8 x double> @mxcsr_fma_sae(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
+; CHECK: VFMADD{{.*}}mxcsr
+ %res = call <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 -1, i32 10)
+ ret <8 x double> %res
+}
+
+declare x86_mmx @llvm.x86.sse.cvtps2pi(<4 x float>)
+declare<4 x float> @llvm.x86.sse.cvtpi2ps(<4 x float>, x86_mmx)
+declare x86_mmx @llvm.x86.sse.cvttps2pi(<4 x float>)
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
+declare x86_mmx @llvm.x86.sse.cvtpd2pi(<2 x double>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>)
+declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x double> @llvm.x86.avx512.mask.vfmadd.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
diff --git a/llvm/test/CodeGen/X86/pr37916.ll b/llvm/test/CodeGen/X86/pr37916.ll
index 2da9413a9a0cf..484104da9ff47 100644
--- a/llvm/test/CodeGen/X86/pr37916.ll
+++ b/llvm/test/CodeGen/X86/pr37916.ll
@@ -7,7 +7,6 @@ define void @fn1() local_unnamed_addr {
 ; CHECK-LABEL: fn1:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB0_1: # %if.end
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: movl a+4, %eax
diff --git a/llvm/test/CodeGen/X86/pr42905.ll b/llvm/test/CodeGen/X86/pr42905.ll
index bb51aced225c6..310a173f824e9 100644
--- a/llvm/test/CodeGen/X86/pr42905.ll
+++ b/llvm/test/CodeGen/X86/pr42905.ll
@@ -11,7 +11,7 @@ define <4 x double> @autogen_SD30452(i1 %L230) {
 ; CHECK-NEXT: movq %xmm2, %rax
 ; CHECK-NEXT: xorps %xmm2, %xmm2
 ; CHECK-NEXT: cvtsi2sd %rax, %xmm2
-; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; CHECK-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1
 ; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr44140.ll b/llvm/test/CodeGen/X86/pr44140.ll
index 9916252e6c499..941f45d2d99a2 100644
--- a/llvm/test/CodeGen/X86/pr44140.ll
+++ b/llvm/test/CodeGen/X86/pr44140.ll
@@ -10,7 +10,6 @@ define win64cc void @opaque() {
 ; We need xmm6 to be live from the loop header across all iterations of the loop.
 ; We shouldn't clobber ymm6 inside the loop.
-; FIXME: We currently clobber ymm6
 
 define i32 @main() {
 ; CHECK-LABEL: main:
 ; CHECK: # %bb.0: # %start
@@ -23,7 +22,7 @@ define i32 @main() {
 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
-; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm6
+; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm7
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm2
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm3
 ; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
@@ -31,10 +30,10 @@ define i32 @main() {
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm1
 ; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovups %ymm6, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: vmovups %ymm7, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm5
diff --git a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
index 9238ab0bf89f7..92708d33924f0 100644
--- a/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
+++ b/llvm/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -29,8 +29,8 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
 ; CHECK-NEXT: pushq %rbx
 ; CHECK-NEXT: .cfi_def_cfa_offset 56
-; CHECK-NEXT: subq $536, %rsp ## imm = 0x218
-; CHECK-NEXT: .cfi_def_cfa_offset 592
+; CHECK-NEXT: subq $552, %rsp ## imm = 0x228
+; CHECK-NEXT: .cfi_def_cfa_offset 608
 ; CHECK-NEXT: .cfi_offset %rbx, -56
 ; CHECK-NEXT: .cfi_offset %r12, -48
 ; CHECK-NEXT: .cfi_offset %r13, -40
@@ -54,7 +54,7 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: je LBB0_55
 ; CHECK-NEXT: LBB0_4: ## %cleanup
-; CHECK-NEXT: addq $536, %rsp ## imm = 0x218
+; CHECK-NEXT: addq $552, %rsp ## imm = 0x228
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r12
 ; CHECK-NEXT: popq %r13
@@ -68,7 +68,7 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: je LBB0_55
 ; CHECK-NEXT: ## %bb.6: ## %SyTime.exit2720
 ; CHECK-NEXT: movq %rdx, %rbx
-; CHECK-NEXT: movq %rdi, %rbp
+; CHECK-NEXT: movq %rdi, %r14
 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rax
 ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT: cmpq %rax, %rcx
@@ -78,10 +78,10 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: movl $32, %esi
 ; CHECK-NEXT: callq _memset
 ; CHECK-NEXT: LBB0_8: ## %while.body.preheader
-; CHECK-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT: imulq $1040, %rbx, %rax ## imm = 0x410
 ; CHECK-NEXT: movq _syBuf@{{.*}}(%rip), %rcx
-; CHECK-NEXT: leaq 8(%rcx,%rax), %rbx
+; CHECK-NEXT: leaq 8(%rcx,%rax), %rax
+; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT: movl $1, %r15d
 ; CHECK-NEXT: movq _syCTRO@{{.*}}(%rip), %rax
 ; CHECK-NEXT: movb $1, %cl
@@ -92,69 +92,70 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: testb %cl, %cl
 ; CHECK-NEXT: jne LBB0_9
 ; CHECK-NEXT: ## %bb.10: ## %do.end
-; CHECK-NEXT: xorl %r14d, %r14d
-; CHECK-NEXT: testb %r14b, %r14b
+; CHECK-NEXT: xorl %ebp, %ebp
+; CHECK-NEXT: testb %bpl, %bpl
 ; CHECK-NEXT: jne LBB0_11
 ; CHECK-NEXT: ## %bb.12: ## %while.body200.preheader
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: leaq {{.*}}(%rip), %rsi
-; CHECK-NEXT: leaq {{.*}}(%rip), %rdi
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: xorl %r13d, %r13d
+; CHECK-NEXT: xorl %ebx, %ebx
+; CHECK-NEXT: leaq {{.*}}(%rip), %r13
+; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT: xorl %r12d, %r12d
+; CHECK-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-NEXT: jmp LBB0_13
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_20: ## %sw.bb256
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl %r14d, %r13d
+; CHECK-NEXT: movl %ebp, %r12d
 ; CHECK-NEXT: LBB0_21: ## %while.cond197.backedge
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT: decl %r15d
 ; CHECK-NEXT: testl %r15d, %r15d
-; CHECK-NEXT: movl %r13d, %r14d
+; CHECK-NEXT: movl %r12d, %ebp
 ; CHECK-NEXT: jle LBB0_22
 ; CHECK-NEXT: LBB0_13: ## %while.body200
 ; CHECK-NEXT: ## =>This Loop Header: Depth=1
 ; CHECK-NEXT: ## Child Loop BB0_30 Depth 2
 ; CHECK-NEXT: ## Child Loop BB0_38 Depth 2
-; CHECK-NEXT: leal -268(%r14), %eax
+; CHECK-NEXT: leal -268(%rbp), %eax
 ; CHECK-NEXT: cmpl $105, %eax
 ; CHECK-NEXT: ja LBB0_14
 ; CHECK-NEXT: ## %bb.56: ## %while.body200
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movslq (%rdi,%rax,4), %rax
-; CHECK-NEXT: addq %rdi, %rax
+; CHECK-NEXT: movslq (%r13,%rax,4), %rax
+; CHECK-NEXT: addq %r13, %rax
 ; CHECK-NEXT: jmpq *%rax
 ; CHECK-NEXT: LBB0_44: ## %while.cond1037.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: movl %r14d, %r13d
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: movl %ebp, %r12d
 ; CHECK-NEXT: jne LBB0_21
 ; CHECK-NEXT: jmp LBB0_55
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_14: ## %while.body200
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: leal 1(%r14), %eax
+; CHECK-NEXT: leal 1(%rbp), %eax
 ; CHECK-NEXT: cmpl $21, %eax
 ; CHECK-NEXT: ja LBB0_20
 ; CHECK-NEXT: ## %bb.15: ## %while.body200
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $-1, %r13d
-; CHECK-NEXT: movslq (%rsi,%rax,4), %rax
-; CHECK-NEXT: addq %rsi, %rax
+; CHECK-NEXT: movl $-1, %r12d
+; CHECK-NEXT: leaq {{.*}}(%rip), %rcx
+; CHECK-NEXT: movslq (%rcx,%rax,4), %rax
+; CHECK-NEXT: addq %rcx, %rax
 ; CHECK-NEXT: jmpq *%rax
 ; CHECK-NEXT: LBB0_18: ## %while.cond201.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $1, %r13d
+; CHECK-NEXT: movl $1, %r12d
 ; CHECK-NEXT: jmp LBB0_21
 ; CHECK-NEXT: LBB0_26: ## %sw.bb474
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: ## implicit-def: $r12
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: ## implicit-def: $r14
 ; CHECK-NEXT: jne LBB0_34
 ; CHECK-NEXT: ## %bb.27: ## %do.body479.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: ## implicit-def: $r12
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: ## implicit-def: $r14
 ; CHECK-NEXT: jne LBB0_34
 ; CHECK-NEXT: ## %bb.28: ## %land.rhs485.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
@@ -165,8 +166,8 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_32: ## %do.body479.backedge
 ; CHECK-NEXT: ## in Loop: Header=BB0_30 Depth=2
-; CHECK-NEXT: leaq 1(%r12), %rax
-; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: leaq 1(%r14), %rax
+; CHECK-NEXT: testb %bl, %bl
 ; CHECK-NEXT: je LBB0_33
 ; CHECK-NEXT: ## %bb.29: ## %land.rhs485
 ; CHECK-NEXT: ## in Loop: Header=BB0_30 Depth=2
@@ -175,15 +176,14 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: LBB0_30: ## %cond.true.i.i2780
 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1
 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: movq %rax, %r12
-; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: movq %rax, %r14
+; CHECK-NEXT: testb %bl, %bl
 ; CHECK-NEXT: jne LBB0_32
 ; CHECK-NEXT: ## %bb.31: ## %lor.rhs500
 ; CHECK-NEXT: ## in Loop: Header=BB0_30 Depth=2
 ; CHECK-NEXT: movl $256, %esi ## imm = 0x100
 ; CHECK-NEXT: callq ___maskrune
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: testb %bl, %bl
 ; CHECK-NEXT: jne LBB0_32
 ; CHECK-NEXT: jmp LBB0_34
 ; CHECK-NEXT: LBB0_45: ## %sw.bb1134
@@ -193,23 +193,23 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: cmpq %rax, %rcx
 ; CHECK-NEXT: jb LBB0_55
 ; CHECK-NEXT: ## %bb.46: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: movl $268, %r13d ## imm = 0x10C
+; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT: movl $268, %r12d ## imm = 0x10C
 ; CHECK-NEXT: jmp LBB0_21
-; CHECK-NEXT: LBB0_19: ## %sw.bb243
+; CHECK-NEXT: LBB0_40: ## %sw.bb566
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $2, %r13d
+; CHECK-NEXT: movl $20, %r12d
 ; CHECK-NEXT: jmp LBB0_21
-; CHECK-NEXT: LBB0_40: ## %sw.bb566
+; CHECK-NEXT: LBB0_19: ## %sw.bb243
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: movl $20, %r13d
+; CHECK-NEXT: movl $2, %r12d
 ; CHECK-NEXT: jmp LBB0_21
 ; CHECK-NEXT: LBB0_33: ## %if.end517.loopexitsplit
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: incq %r12
+; CHECK-NEXT: incq %r14
 ; CHECK-NEXT: LBB0_34: ## %if.end517
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: leal -324(%r13), %eax
+; CHECK-NEXT: leal -324(%r12), %eax
 ; CHECK-NEXT: cmpl $59, %eax
 ; CHECK-NEXT: ja LBB0_35
 ; CHECK-NEXT: ## %bb.57: ## %if.end517
@@ -219,11 +219,11 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: jb LBB0_38
 ; CHECK-NEXT: LBB0_35: ## %if.end517
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: cmpl $11, %r13d
+; CHECK-NEXT: cmpl $11, %r12d
 ; CHECK-NEXT: je LBB0_38
 ; CHECK-NEXT: ## %bb.36: ## %if.end517
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: cmpl $24, %r13d
+; CHECK-NEXT: cmpl $24, %r12d
 ; CHECK-NEXT: je LBB0_38
 ; CHECK-NEXT: ## %bb.37: ## %if.then532
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
@@ -233,15 +233,14 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: LBB0_38: ## %for.cond534
 ; CHECK-NEXT: ## Parent Loop BB0_13 Depth=1
 ; CHECK-NEXT: ## => This Inner Loop Header: Depth=2
-; CHECK-NEXT: testb %dl, %dl
+; CHECK-NEXT: testb %bl, %bl
 ; CHECK-NEXT: jne LBB0_38
 ; CHECK-NEXT: ## %bb.39: ## %for.cond542.preheader
 ; CHECK-NEXT: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT: testb %dl, %dl
-; CHECK-NEXT: movb $0, (%r12)
-; CHECK-NEXT: movl %r14d, %r13d
-; CHECK-NEXT: leaq {{.*}}(%rip), %rsi
-; CHECK-NEXT: leaq {{.*}}(%rip), %rdi
+; CHECK-NEXT: testb %bl, %bl
+; CHECK-NEXT: movb $0, (%r14)
+; CHECK-NEXT: movl %ebp, %r12d
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 ## 8-byte Reload
 ; CHECK-NEXT: jmp LBB0_21
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_42: ## %while.cond864
@@ -256,30 +255,44 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: jmp LBB0_25
 ; CHECK-NEXT: LBB0_11:
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: xorl %r13d, %r13d
+; CHECK-NEXT: movl $0, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
+; CHECK-NEXT: xorl %r12d, %r12d
 ; CHECK-NEXT: LBB0_22: ## %while.end1465
-; CHECK-NEXT: incl %r13d
-; CHECK-NEXT: cmpl $16, %r13d
+; CHECK-NEXT: incl %r12d
+; CHECK-NEXT: cmpl $16, %r12d
 ; CHECK-NEXT: ja LBB0_50
 ; CHECK-NEXT: ## %bb.23: ## %while.end1465
 ; CHECK-NEXT: movl $83969, %eax ## imm = 0x14801
-; CHECK-NEXT: btl %r13d, %eax
+; CHECK-NEXT: btl %r12d, %eax
 ; CHECK-NEXT: jae LBB0_50
 ; CHECK-NEXT: ## %bb.24:
-; CHECK-NEXT: xorl %ebp, %ebp
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
+; CHECK-NEXT: xorl %ebx, %ebx
 ; CHECK-NEXT: LBB0_48: ## %if.then1477
 ; CHECK-NEXT: movl $1, %edx
 ; CHECK-NEXT: callq _write
-; CHECK-NEXT: subq %rbp, %rbx
+; CHECK-NEXT: subq %rbx, %r14
 ; CHECK-NEXT: movq _syHistory@{{.*}}(%rip), %rax
-; CHECK-NEXT: leaq 8189(%rbx,%rax), %rax
+; CHECK-NEXT: leaq 8189(%r14,%rax), %rax
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_49: ## %for.body1723
 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: decq %rax
 ; CHECK-NEXT: jmp LBB0_49
+; CHECK-NEXT: LBB0_47: ## %if.then1477.loopexit
+; CHECK-NEXT: movq %r14, %rbx
+; CHECK-NEXT: jmp LBB0_48
+; CHECK-NEXT: LBB0_16: ## %while.cond635.preheader
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: je LBB0_41
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: LBB0_17: ## %for.body643.us
+; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: jmp LBB0_17
+; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: LBB0_41: ## %while.cond661
+; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: jmp LBB0_41
 ; CHECK-NEXT: LBB0_50: ## %for.cond1480.preheader
 ; CHECK-NEXT: movl $512, %eax ## imm = 0x200
 ; CHECK-NEXT: cmpq %rax, %rax
@@ -289,14 +302,15 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: jne LBB0_54
 ; CHECK-NEXT: ## %bb.52: ## %while.body1679.preheader
-; CHECK-NEXT: incl %ebp
-; CHECK-NEXT: .p2align 4, 0x90
+; CHECK-NEXT: incl {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Folded Spill
 ; CHECK-NEXT: LBB0_53: ## %while.body1679
 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq (%rbx), %rdi
+; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
+; CHECK-NEXT: movq (%rax), %rdi
 ; CHECK-NEXT: callq _fileno
-; CHECK-NEXT: movslq %ebp, %rax
-; CHECK-NEXT: leal 1(%rax), %ebp
+; CHECK-NEXT: movslq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 4-byte Folded Reload
+; CHECK-NEXT: leal 1(%rax), %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) ## 4-byte Spill
 ; CHECK-NEXT: cmpq %rax, %rax
 ; CHECK-NEXT: jl LBB0_53
 ; CHECK-NEXT: LBB0_54: ## %while.cond1683.preheader
@@ -304,22 +318,6 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT: testb %al, %al
 ; CHECK-NEXT: LBB0_55: ## %if.then.i
 ; CHECK-NEXT: ud2
-; CHECK-NEXT: LBB0_47: ## %if.then1477.loopexit
-; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx ## 8-byte Reload
-; CHECK-NEXT: movq %rbx, %rbp
-; CHECK-NEXT: jmp LBB0_48
-; CHECK-NEXT: LBB0_16: ## %while.cond635.preheader
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: je LBB0_41
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB0_17: ## %for.body643.us
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: jmp LBB0_17
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB0_41: ## %while.cond661
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: jmp LBB0_41
 entry:
 %sub.ptr.rhs.cast646 = ptrtoint i8* %line to i64
 %old = alloca [512 x i8], align 16
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 9f9d75cb36ca8..6b29bd2207afe 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -296,8 +296,7 @@ define void @test_shl_i128(i128 %x, i128 %a, i128* nocapture %r) nounwind {
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movb {{[0-9]+}}(%esp), %al
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: shll %cl, %ebx
 ; X86-NEXT: movl %ebp, %esi
@@ -534,8 +533,7 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocaptur
 ; X86-NEXT: .LBB6_9: # %entry
 ; X86-NEXT: movl %edi, %esi
 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT: shrl %cl, %ebp
 ; X86-NEXT: testb $32, %cl
@@ -795,9 +793,8 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocaptur
 ; X86-NEXT: # %bb.4: # %entry
 ; X86-NEXT: movl %edi, %ebx
 ; X86-NEXT: .LBB7_5: # %entry
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %ebp
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebp, %edi
 ; X86-NEXT: movl %edx, %ecx
 ; X86-NEXT: sarl %cl, %edi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -835,8 +832,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocaptur
 ; X86-NEXT: movl %esi, %edi
 ; X86-NEXT: .LBB7_9: # %entry
 ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT: sarl %cl, %esi
 ; X86-NEXT: testb $32, %cl
@@ -850,8 +846,7 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocaptur
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movb $64, %cl
 ; X86-NEXT: subb %dl, %cl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %ebx, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: shldl %cl, %ebx, %ebp
 ; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -1059,12 +1054,11 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocapture
 ; X86-NEXT: pushl %esi
 ; X86-NEXT: subl $72, %esp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT: movl %ebx, %ecx
 ; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: movl %eax, %esi
 ; X86-NEXT: shll %cl, %esi
 ; X86-NEXT: movl %edx, %eax
 ; X86-NEXT: subl $64, %eax
@@ -1130,9 +1124,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocapture
 ; X86-NEXT: movl %ecx, %ebp
 ; X86-NEXT: movl %edx, %ecx
 ; X86-NEXT: shll %cl, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT: shll %cl, %esi
 ; X86-NEXT: testb $32, %dl
 ; X86-NEXT: movl $0, %edi
@@ -1210,8 +1202,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, <2 x i128>* nocapture
 ; X86-NEXT: movl %edi, %ecx
 ; X86-NEXT: .LBB8_23: # %entry
 ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT: movl %eax, %ecx
 ; X86-NEXT: shll %cl, %edi
 ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/sqrt-partial.ll b/llvm/test/CodeGen/X86/sqrt-partial.ll
index 7ed68c1084998..48914d8ed44e0 100644
--- a/llvm/test/CodeGen/X86/sqrt-partial.ll
+++ b/llvm/test/CodeGen/X86/sqrt-partial.ll
@@ -38,7 +38,7 @@ define float @f(float %val) nounwind {
 define double @d(double %val) nounwind {
 ; SSE-LABEL: d:
 ; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: xorpd %xmm1, %xmm1
 ; SSE-NEXT: ucomisd %xmm1, %xmm0
 ; SSE-NEXT: jb .LBB1_2
 ; SSE-NEXT: # %bb.1: # %.split
@@ -49,7 +49,7 @@ define double @d(double %val) nounwind {
 ;
 ; AVX-LABEL: d:
 ; AVX: # %bb.0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vucomisd %xmm1, %xmm0
 ; AVX-NEXT: jb .LBB1_2
 ; AVX-NEXT: # %bb.1: # %.split
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 4f26db8869d92..7ce16bbc3d420 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -98,8 +98,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT: addl %esi, %ecx
 ; X86-NEXT: adcl $0, %ebp
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: mull %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: mull %edx
 ; X86-NEXT: movl %edx, %esi
 ; X86-NEXT: addl %ecx, %eax
 ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
diff --git a/llvm/test/CodeGen/X86/undef-label.ll b/llvm/test/CodeGen/X86/undef-label.ll
index b4be383d55ddc..56e0ca907f8e1 100644
--- a/llvm/test/CodeGen/X86/undef-label.ll
+++ b/llvm/test/CodeGen/X86/undef-label.ll
@@ -11,7 +11,7 @@ define void @xyz() {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movl $g, %eax
 ; CHECK-NEXT: movq %rax, %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm1
+; CHECK-NEXT: xorpd %xmm1, %xmm1
 ; CHECK-NEXT: ucomisd %xmm1, %xmm0
 ; CHECK-NEXT: jne .LBB0_1
 ; CHECK-NEXT: jnp .LBB0_2
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index fc3233327a558..bf2ea5e067cc1 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -21,7 +21,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
@@ -125,13 +125,13 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movq %rax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; SSE-NEXT: cvttsd2si %xmm1, %rax
 ; SSE-NEXT: movq %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: cvttsd2si %xmm1, %rax
 ; SSE-NEXT: movq %rax, %xmm0
 ; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
@@ -335,7 +335,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; SSE-LABEL: fptoui_2f64_to_4i32:
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rcx
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: movd %ecx, %xmm1
@@ -409,7 +409,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -482,7 +482,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -734,13 +734,13 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; SSE: # %bb.0:
 ; SSE-NEXT: cvttsd2si %xmm1, %rax
 ; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT: cvttsd2si %xmm1, %rax
 ; SSE-NEXT: movd %eax, %xmm1
 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: movd %eax, %xmm0
 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 269879e7f1a31..1d0106b75a84f 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -27,8 +27,8 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: sitofp_2i64_to_2f64:
@@ -38,7 +38,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; SSE41-NEXT: movq %xmm0, %rax
 ; SSE41-NEXT: xorps %xmm0, %xmm0
 ; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_2i64_to_2f64:
@@ -47,7 +47,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
 ; VEX-NEXT: vmovq %xmm0, %rax
 ; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: sitofp_2i64_to_2f64:
@@ -56,7 +56,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
 ; AVX512F-NEXT: vmovq %xmm0, %rax
 ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: sitofp_2i64_to_2f64:
@@ -65,7 +65,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
 ; AVX512VL-NEXT: vmovq %xmm0, %rax
 ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
@@ -237,16 +237,16 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
 ; SSE2-NEXT: movq %xmm1, %rax
 ; SSE2-NEXT: cvtsi2sd %rax, %xmm3
 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE2-NEXT: movq %xmm0, %rax
 ; SSE2-NEXT: xorps %xmm0, %xmm0
 ; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: movapd %xmm3, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: sitofp_4i64_to_4f64:
@@ -256,14 +256,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; SSE41-NEXT: movq %xmm0, %rax
 ; SSE41-NEXT: xorps %xmm0, %xmm0
 ; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE41-NEXT: pextrq $1, %xmm1, %rax
 ; SSE41-NEXT: xorps %xmm2, %xmm2
 ; SSE41-NEXT: cvtsi2sd %rax, %xmm2
 ; SSE41-NEXT: movq %xmm1, %rax
 ; SSE41-NEXT: xorps %xmm1, %xmm1
 ; SSE41-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_4i64_to_4f64:
@@ -273,12 +273,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
 ; AVX1-NEXT: vmovq %xmm1, %rax
 ; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
 ; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
@@ -289,12 +289,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX2-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
 ; AVX2-NEXT: vmovq %xmm1, %rax
 ; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
 ;
@@ -305,12 +305,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
 ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
 ; AVX512F-NEXT: vmovq %xmm1, %rax
 ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
 ; AVX512F-NEXT: vmovq %xmm0, %rax
 ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:
retq ; @@ -321,12 +321,12 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1204,7 +1204,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: retq ; @@ -1235,7 +1235,7 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: retq ; @@ -1274,7 +1274,7 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: retq ; @@ -1304,7 +1304,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; SSE2-NEXT: movq %xmm0, %rax ; SSE2-NEXT: xorps %xmm0, %xmm0 ; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero ; SSE2-NEXT: retq ; @@ -1342,7 +1342,7 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: retq ; @@ -1927,7 +1927,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: retq ; @@ -2074,7 +2074,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; AVX512VL-NEXT: vunpcklps 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: retq ; @@ -2216,7 +2216,7 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX512VL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: retq ; @@ -3023,7 +3023,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_2i64_to_2f64: @@ -3034,7 +3034,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE41-NEXT: retq ; ; VEX-LABEL: sitofp_load_2i64_to_2f64: @@ -3044,7 +3044,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_2i64_to_2f64: @@ -3054,7 +3054,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_2i64_to_2f64: @@ -3064,7 +3064,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: @@ -3220,7 +3220,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE2-NEXT: movq %xmm1, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: cvtsi2sd %rax, %xmm1 @@ -3228,7 +3228,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE2-NEXT: movq %xmm2, %rax ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: cvtsi2sd %rax, %xmm2 -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: sitofp_load_4i64_to_4f64: @@ -3240,64 +3240,64 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; SSE41-NEXT: movq %xmm0, %rax ; SSE41-NEXT: xorps %xmm0, %xmm0 ; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE41-NEXT: pextrq $1, %xmm1, %rax ; SSE41-NEXT: xorps %xmm2, %xmm2 ; SSE41-NEXT: 
cvtsi2sd %rax, %xmm2 ; SSE41-NEXT: movq %xmm1, %rax ; SSE41-NEXT: xorps %xmm1, %xmm1 ; SSE41-NEXT: cvtsi2sd %rax, %xmm1 -; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE41-NEXT: retq ; ; VEX-LABEL: sitofp_load_4i64_to_4f64: ; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovapd (%rdi), %xmm0 ; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 ; VEX-NEXT: vpextrq $1, %xmm1, %rax ; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; VEX-NEXT: vmovq %xmm1, %rax ; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; VEX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; VEX-NEXT: vpextrq $1, %xmm0, %rax ; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 ; VEX-NEXT: vmovq %xmm0, %rax ; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovapd (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 ; AVX512F-NEXT: vmovq %xmm0, %rax ; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovapd (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 ; AVX512VL-NEXT: vmovq %xmm0, %rax ; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4288,7 +4288,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; ; VEX-LABEL: sitofp_load_8i64_to_8f32: ; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vmovaps (%rdi), %xmm0 ; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 ; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 ; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4319,7 +4319,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4350,7 +4350,7 @@ define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 ; 
AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -4648,7 +4648,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; VEX-LABEL: uitofp_load_4i64_to_4f32: ; VEX: # %bb.0: ; VEX-NEXT: vmovdqa (%rdi), %xmm2 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vmovaps 16(%rdi), %xmm0 ; VEX-NEXT: vpextrq $1, %xmm2, %rax ; VEX-NEXT: testq %rax, %rax ; VEX-NEXT: js .LBB81_1 @@ -5167,7 +5167,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; VEX-LABEL: uitofp_load_8i64_to_8f32: ; VEX: # %bb.0: ; VEX-NEXT: vmovdqa (%rdi), %xmm1 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 +; VEX-NEXT: vmovaps 16(%rdi), %xmm0 ; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 ; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 ; VEX-NEXT: vpextrq $1, %xmm4, %rax @@ -5293,7 +5293,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512F-NEXT: vmovaps (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 @@ -5324,7 +5324,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index b5dff70e234e4..8cf8cab8b79b1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2914,56 +2914,40 @@ define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) { define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { ; SSE2-LABEL: shuffle_extract_concat_insert: ; SSE2: # %bb.0: -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: pextrw $2, %xmm1, %ecx -; SSE2-NEXT: pextrw $5, %xmm2, %edx -; SSE2-NEXT: pextrw $7, %xmm2, %esi -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE2-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE2-NEXT: pinsrw $5, %edx, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: pinsrw $7, %esi, %xmm0 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_extract_concat_insert: ; SSSE3: # %bb.0: -; SSSE3-NEXT: pextrw $2, %xmm1, %eax -; SSSE3-NEXT: pextrw $5, %xmm2, %ecx -; SSSE3-NEXT: pextrw $7, %xmm2, %edx -; SSSE3-NEXT: movd %xmm1, %esi +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,8,9,12,13,12,13,14,15] +; 
SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,14,15,10,11,12,13,14,15] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,8,9,14,15,12,13,14,15] -; SSSE3-NEXT: pinsrw $4, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $5, %ecx, %xmm0 -; SSSE3-NEXT: pinsrw $6, %esi, %xmm0 -; SSSE3-NEXT: pinsrw $7, %edx, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_extract_concat_insert: ; SSE41: # %bb.0: -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: pextrw $2, %xmm1, %ecx +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,14,15,10,11,12,13,14,15] ; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,8,9,14,15,12,13,14,15] -; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] -; SSE41-NEXT: pinsrw $6, %eax, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] ; SSE41-NEXT: retq ; ; AVX-LABEL: shuffle_extract_concat_insert: ; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm1, %eax -; AVX-NEXT: vpextrw $2, %xmm1, %ecx -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,0,1,14,15,8,9,14,15,12,13,14,15] -; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5],xmm0[6,7] -; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,8,9,12,13,12,13,14,15] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,14,15,10,11,12,13,14,15] +; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX-NEXT: retq %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> %a0 = extractelement <8 x i16> %a, i32 0 diff --git a/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll b/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll new file mode 100644 index 0000000000000..9555ce032db90 --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-stackprobe-overflow.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=x86_64-windows-gnu | FileCheck %s + +define void @foo() unnamed_addr #0 { +start: + %b = alloca i64, align 8 + %c = alloca [4294967295 x i8], align 1 + ret void +} + +attributes #0 = { nonlazybind uwtable "probe-stack"="probe_stack" "target-cpu"="x86-64" } + +; CHECK-LABEL: foo: +; CHECK: movabsq $4294967304, %rax +; CHECK-NEXT: callq probe_stack diff --git a/llvm/test/DebugInfo/Inputs/.build-id/ab/b50d82b6bdc861.debug b/llvm/test/DebugInfo/Inputs/.build-id/ab/b50d82b6bdc861.debug new file mode 100755 index 0000000000000..5eafa358a9c69 Binary files /dev/null and b/llvm/test/DebugInfo/Inputs/.build-id/ab/b50d82b6bdc861.debug differ diff --git a/llvm/test/DebugInfo/Inputs/dwarfdump-macro.dwo b/llvm/test/DebugInfo/Inputs/dwarfdump-macro.dwo new file mode 100644 index 0000000000000..5b0c16b745c40 Binary files /dev/null and b/llvm/test/DebugInfo/Inputs/dwarfdump-macro.dwo differ diff --git a/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir b/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir index 
d2745874d5ef9..9001c8ba8eea2 100644 --- a/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir +++ b/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir @@ -129,6 +129,19 @@ body: | --- name: callee tracksRegLiveness: true +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r11', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 2, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r10', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 3, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } body: | bb.0: successors: %bb.2(0x30000000), %bb.1(0x50000000) diff --git a/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir b/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir new file mode 100644 index 0000000000000..8ae628af2c099 --- /dev/null +++ b/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir @@ -0,0 +1,187 @@ +# RUN: llc -mtriple hexagon -debug-entry-values -start-after=machineverifier -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s + +# Based on the following C reproducer: +# +# int ga, gb, gc; +# +# extern void callee(int, int, int); +# +# void caller() { +# int a = ga; +# int b = gb; +# int c = gc; +# +# // Clobber all integer registers. 
+# __asm("" : : : +# "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", +# "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", +# "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28"); +# +# callee(a, b, c); +# } + +--- | + target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" + target triple = "hexagon" + + @ga = common global i32 0, align 4 + @gb = common global i32 0, align 4 + @gc = common global i32 0, align 4 + + ; Function Attrs: nounwind + define void @caller() #0 !dbg !12 { + entry: + %0 = load i32, i32* @ga, align 4, !dbg !15 + %1 = load i32, i32* @gb, align 4, !dbg !16 + %2 = load i32, i32* @gc, align 4, !dbg !17 + call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28}"(), !dbg !18, !srcloc !19 + call void @callee(i32 %0, i32 %1, i32 %2), !dbg !20 + ret void, !dbg !21 + } + + declare !dbg !4 void @callee(i32, i32, i32) + + attributes #0 = { nounwind } + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!8, !9, !10} + !llvm.ident = !{!11} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2, nameTableKind: None) + !1 = !DIFile(filename: "h.c", directory: "/") + !2 = !{} + !3 = !{!4} + !4 = !DISubprogram(name: "callee", scope: !1, file: !1, line: 3, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) + !5 = !DISubroutineType(types: !6) + !6 = !{null, !7, !7, !7} + !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !8 = !{i32 7, !"Dwarf Version", i32 4} + !9 = !{i32 2, !"Debug Info Version", i32 3} + !10 = !{i32 1, !"wchar_size", i32 4} + !11 = !{!"clang version 10.0.0"} + !12 = distinct !DISubprogram(name: "caller", scope: !1, file: !1, line: 5, type: !13, scopeLine: 5, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) + !13 = !DISubroutineType(types: !14) + !14 = !{null} + !15 = !DILocation(line: 6, scope: !12) + !16 = !DILocation(line: 7, scope: !12) + !17 = !DILocation(line: 8, scope: !12) + !18 = !DILocation(line: 11, scope: !12) + !19 = !{i32 158} + !20 = !DILocation(line: 16, scope: !12) + !21 = !DILocation(line: 17, scope: !12) + +... 
+--- +name: caller +tracksRegLiveness: true +frameInfo: + stackSize: 64 + maxAlignment: 4 + adjustsStack: true + hasCalls: true + maxCallFrameSize: 0 +fixedStack: + - { id: 0, type: spill-slot, offset: -48, size: 8, alignment: 8, callee-saved-register: '$d13' } + - { id: 1, type: spill-slot, offset: -40, size: 8, alignment: 8, callee-saved-register: '$d12' } + - { id: 2, type: spill-slot, offset: -32, size: 8, alignment: 8, callee-saved-register: '$d11' } + - { id: 3, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '$d10' } + - { id: 4, type: spill-slot, offset: -16, size: 8, alignment: 8, callee-saved-register: '$d9' } + - { id: 5, type: spill-slot, offset: -8, size: 8, alignment: 8, callee-saved-register: '$d8' } +stack: + - { id: 0, type: spill-slot, offset: -52, size: 4, alignment: 4 } + - { id: 1, type: spill-slot, offset: -56, size: 4, alignment: 4 } + - { id: 2, type: spill-slot, offset: -60, size: 4, alignment: 4 } +callSites: + - { bb: 0, offset: 40, fwdArgRegs: + - { arg: 0, reg: '$r0' } + - { arg: 1, reg: '$r1' } + - { arg: 2, reg: '$r2' } } +body: | + bb.0.entry: + liveins: $d8, $d9, $d10, $d11, $d12, $d13, $d8, $d9, $d10, $d11, $d12, $d13 + + BUNDLE implicit-def $r29, implicit-def $r30, implicit $r29, implicit killed $framekey, implicit killed $framelimit, implicit killed $r30, implicit killed $r31, implicit killed $d8, debug-location !15 { + $r29 = S2_allocframe $r29, 64, implicit-def $r30, implicit killed $framekey, implicit killed $framelimit, implicit killed $r30, implicit killed $r31, debug-location !15 :: (store 4 into stack) + S2_storerd_io internal $r29, -16, killed $d8, debug-location !15 :: (store 8 into %fixed-stack.5) + } + CFI_INSTRUCTION def_cfa $r30, 8 + CFI_INSTRUCTION offset $r31, -4 + CFI_INSTRUCTION offset $r30, -8 + CFI_INSTRUCTION offset $r17, -12 + CFI_INSTRUCTION offset $r16, -16 + CFI_INSTRUCTION offset $r19, -20 + CFI_INSTRUCTION offset $r18, -24 + CFI_INSTRUCTION offset $r21, -28 + CFI_INSTRUCTION offset $r20, -32 + CFI_INSTRUCTION offset $r23, -36 + CFI_INSTRUCTION offset $r22, -40 + CFI_INSTRUCTION offset $r25, -44 + CFI_INSTRUCTION offset $r24, -48 + CFI_INSTRUCTION offset $r27, -52 + CFI_INSTRUCTION offset $r26, -56 + BUNDLE implicit $r29, implicit killed $d9, implicit killed $d10, debug-location !15 { + S2_storerd_io $r29, 48, killed $d9, debug-location !15 :: (store 8 into %fixed-stack.4) + S2_storerd_io $r29, 40, killed $d10, debug-location !15 :: (store 8 into %fixed-stack.3) + } + BUNDLE implicit $r29, implicit killed $d11, implicit killed $d12, debug-location !15 { + S2_storerd_io $r29, 32, killed $d11, debug-location !15 :: (store 8 into %fixed-stack.2) + S2_storerd_io $r29, 24, killed $d12, debug-location !15 :: (store 8 into %fixed-stack.1) + } + BUNDLE implicit-def $r0, implicit $r29, implicit killed $d13, implicit $gp, debug-location !15 { + S2_storerd_io $r29, 16, killed $d13, debug-location !15 :: (store 8 into %fixed-stack.0) + renamable $r0 = L2_loadrigp @ga, implicit $gp, debug-location !15 :: (dereferenceable load 4 from @ga) + } + BUNDLE implicit-def $r0, implicit $r29, implicit killed $r0, implicit $gp, debug-location !16 { + S2_storeri_io $r29, 12, killed renamable $r0, debug-location !16 :: (store 4 into %stack.0) + renamable $r0 = L2_loadrigp @gb, implicit $gp, debug-location !16 :: (dereferenceable load 4 from @gb) + } + BUNDLE implicit-def $r0, implicit $r29, implicit killed $r0, implicit killed $gp, debug-location !17 { + S2_storeri_io $r29, 8, killed renamable $r0, debug-location !17 :: (store 
4 into %stack.1) + renamable $r0 = L2_loadrigp @gc, implicit killed $gp, debug-location !17 :: (dereferenceable load 4 from @gc) + } + S2_storeri_io $r29, 4, killed renamable $r0, debug-location !18 :: (store 4 into %stack.2) + INLINEASM &"", 1, 12, implicit-def dead early-clobber $r0, 12, implicit-def dead early-clobber $r1, 12, implicit-def dead early-clobber $r2, 12, implicit-def dead early-clobber $r3, 12, implicit-def dead early-clobber $r4, 12, implicit-def dead early-clobber $r5, 12, implicit-def dead early-clobber $r6, 12, implicit-def dead early-clobber $r7, 12, implicit-def dead early-clobber $r8, 12, implicit-def dead early-clobber $r9, 12, implicit-def dead early-clobber $r10, 12, implicit-def dead early-clobber $r11, 12, implicit-def dead early-clobber $r12, 12, implicit-def dead early-clobber $r13, 12, implicit-def dead early-clobber $r14, 12, implicit-def dead early-clobber $r15, 12, implicit-def dead early-clobber $r16, 12, implicit-def dead early-clobber $r17, 12, implicit-def dead early-clobber $r18, 12, implicit-def dead early-clobber $r19, 12, implicit-def dead early-clobber $r20, 12, implicit-def dead early-clobber $r21, 12, implicit-def dead early-clobber $r22, 12, implicit-def dead early-clobber $r23, 12, implicit-def dead early-clobber $r24, 12, implicit-def dead early-clobber $r25, 12, implicit-def dead early-clobber $r26, 12, implicit-def dead early-clobber $r27, 12, implicit-def dead early-clobber $r28, !19, debug-location !18 + BUNDLE implicit-def $r0, implicit-def $r1, implicit $r29, debug-location !20 { + $r0 = L2_loadri_io $r29, 12, debug-location !20 :: (load 4 from %stack.0) + $r1 = L2_loadri_io $r29, 8, debug-location !20 :: (load 4 from %stack.1) + } + BUNDLE implicit-def dead $r2, implicit-def dead $pc, implicit-def dead $r31, implicit-def $r29, implicit $r29, implicit killed $r0, implicit killed $r1, debug-location !20 { + $r2 = L2_loadri_io $r29, 4, debug-location !20 :: (load 4 from %stack.2) + J2_call @callee, hexagoncsr, implicit-def dead $pc, implicit-def dead $r31, implicit $r29, implicit killed $r0, implicit killed $r1, implicit internal killed $r2, implicit-def $r29, debug-location !20 + } + BUNDLE implicit-def $d8, implicit-def $r16, implicit-def $r17, implicit-def $d9, implicit-def $r18, implicit-def $r19, implicit $r29, debug-location !21 { + $d8 = L2_loadrd_io $r29, 56, debug-location !21 :: (load 8 from %fixed-stack.5) + $d9 = L2_loadrd_io $r29, 48, debug-location !21 :: (load 8 from %fixed-stack.4) + } + BUNDLE implicit-def $d10, implicit-def $r20, implicit-def $r21, implicit-def $d11, implicit-def $r22, implicit-def $r23, implicit $r29, debug-location !21 { + $d10 = L2_loadrd_io $r29, 40, debug-location !21 :: (load 8 from %fixed-stack.3) + $d11 = L2_loadrd_io $r29, 32, debug-location !21 :: (load 8 from %fixed-stack.2) + } + BUNDLE implicit-def $d12, implicit-def $r24, implicit-def $r25, implicit-def $d13, implicit-def $r26, implicit-def $r27, implicit killed $r29, debug-location !21 { + $d12 = L2_loadrd_io $r29, 24, debug-location !21 :: (load 8 from %fixed-stack.1) + $d13 = L2_loadrd_io killed $r29, 16, debug-location !21 :: (load 8 from %fixed-stack.0) + } + $d15 = L4_return killed $r30, implicit-def $pc, implicit-def $r29, implicit killed $framekey, implicit-def dead $pc, implicit $d8, implicit $d9, implicit $d10, implicit $d11, implicit $d12, implicit $d13, debug-location !21 + +... + +# Verify that call site entries are emitted for all three parameters. 
+# Previously, the code that looks for instructions with which to describe +# parameters would stop when reaching the bundle header for the bundled call, +# resulting in $r0 and $r1 not being described. +# +# Please note that, at the time this test was created, the Hexagon target did +# not support call site information, so the "callSites" array has been manually +# added. + +# CHECK: DW_TAG_GNU_call_site_parameter +# CHECK-NEXT: DW_AT_location (DW_OP_reg2 R2) +# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg29 R29+4, DW_OP_deref_size 0x4) + +# CHECK: DW_TAG_GNU_call_site_parameter +# CHECK-NEXT: DW_AT_location (DW_OP_reg1 R1) +# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg29 R29+8, DW_OP_deref_size 0x4) + +# CHECK: DW_TAG_GNU_call_site_parameter +# CHECK-NEXT: DW_AT_location (DW_OP_reg0 R0) +# CHECK-NEXT: DW_AT_GNU_call_site_value (DW_OP_breg29 R29+12, DW_OP_deref_size 0x4) diff --git a/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir b/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir new file mode 100644 index 0000000000000..8d121c3a30b91 --- /dev/null +++ b/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir @@ -0,0 +1,121 @@ +# RUN: llc -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# +#extern void fn1 (int, int, int); +# +#__attribute__((noinline)) +#int +#fn2 (int a, int b, int c) { +# int q = 2 + a; +# +# fn1 (5, 6, q); +# +# b = b + 7; +# if (b < 17) +# return 1; +# else +# return 0; +#} +# +# CHECK: ![[ARG_A:.*]] = !DILocalVariable(name: "a" +# CHECK: ![[ARG_B:.*]] = !DILocalVariable(name: "b" +# CHECK: ![[ARG_C:.*]] = !DILocalVariable(name: "c" +# CHECK: DBG_VALUE $edi, $noreg, ![[ARG_A]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: DBG_VALUE $edx, $noreg, ![[ARG_C]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: DBG_VALUE $edi, $noreg, ![[ARG_A]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK-NOT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# +--- | + ; ModuleID = 'test.c' + source_filename = "test.c" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + ; Function Attrs: noinline nounwind uwtable + define dso_local i32 @fn2(i32 %a, i32 %b, i32 %c) local_unnamed_addr !dbg !12 { + entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !16, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %b, metadata !17, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %c, metadata !18, metadata !DIExpression()), !dbg !20 + %add = add nsw i32 %a, 2, !dbg !21 + call void @llvm.dbg.value(metadata i32 %add, metadata !19, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 6, i32 %add), !dbg !22 + call void @llvm.dbg.value(metadata i32 %b, metadata !17, metadata !DIExpression(DW_OP_plus_uconst, 7, DW_OP_stack_value)), !dbg !20 + %cmp = icmp slt i32 %b, 10, !dbg !23 + %. 
= zext i1 %cmp to i32, !dbg !25 + ret i32 %., !dbg !26 + } + + declare !dbg !4 dso_local void @fn1(i32, i32, i32) local_unnamed_addr + + ; Function Attrs: nounwind readnone speculatable willreturn + declare void @llvm.dbg.value(metadata, metadata, metadata) + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!8, !9, !10} + !llvm.ident = !{!11} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) + !1 = !DIFile(filename: "test.c", directory: "/dir") + !2 = !{} + !3 = !{!4} + !4 = !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) + !5 = !DISubroutineType(types: !6) + !6 = !{null, !7, !7, !7} + !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !8 = !{i32 2, !"Dwarf Version", i32 4} + !9 = !{i32 2, !"Debug Info Version", i32 3} + !10 = !{i32 1, !"wchar_size", i32 4} + !11 = !{!"clang version 10.0.0"} + !12 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 5, type: !13, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15) + !13 = !DISubroutineType(types: !14) + !14 = !{!7, !7, !7, !7} + !15 = !{!16, !17, !18, !19} + !16 = !DILocalVariable(name: "a", arg: 1, scope: !12, file: !1, line: 5, type: !7) + !17 = !DILocalVariable(name: "b", arg: 2, scope: !12, file: !1, line: 5, type: !7) + !18 = !DILocalVariable(name: "c", arg: 3, scope: !12, file: !1, line: 5, type: !7) + !19 = !DILocalVariable(name: "q", scope: !12, file: !1, line: 7, type: !7) + !20 = !DILocation(line: 0, scope: !12) + !21 = !DILocation(line: 7, column: 15, scope: !12) + !22 = !DILocation(line: 9, column: 5, scope: !12) + !23 = !DILocation(line: 12, column: 11, scope: !24) + !24 = distinct !DILexicalBlock(scope: !12, file: !1, line: 12, column: 9) + !25 = !DILocation(line: 0, scope: !24) + !26 = !DILocation(line: 16, column: 1, scope: !12) + +... 
+--- +name: fn2 +alignment: 16 +callSites: + - { bb: 0, offset: 14, fwdArgRegs: + - { arg: 0, reg: '$edi' } + - { arg: 1, reg: '$esi' } + - { arg: 2, reg: '$edx' } } +body: | + bb.0.entry: + liveins: $edi, $esi, $rbx + + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + DBG_VALUE $esi, $noreg, !17, !DIExpression(), debug-location !20 + DBG_VALUE $edx, $noreg, !18, !DIExpression(), debug-location !20 + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + CFI_INSTRUCTION offset $rbx, -16 + $ebx = MOV32rr $esi + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + renamable $edi = KILL $edi, implicit-def $rdi + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + renamable $edx = LEA64_32r killed renamable $rdi, 1, $noreg, 2, $noreg, debug-location !21 + DBG_VALUE $edx, $noreg, !19, !DIExpression(), debug-location !20 + $edi = MOV32ri 5, debug-location !22 + $esi = MOV32ri 6, debug-location !22 + CALL64pcrel32 @fn1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit $edx, implicit-def $rsp, implicit-def $ssp, debug-location !22 + DBG_VALUE $ebx, $noreg, !17, !DIExpression(DW_OP_plus_uconst, 7, DW_OP_stack_value), debug-location !20 + renamable $eax = XOR32rr undef $eax, undef $eax, implicit-def dead $eflags, debug-location !23 + CMP32ri8 killed renamable $ebx, 10, implicit-def $eflags, debug-location !23 + renamable $al = SETCCr 12, implicit killed $eflags, implicit killed $eax, implicit-def $eax, debug-location !23 + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !26 + CFI_INSTRUCTION def_cfa_offset 8, debug-location !26 + RETQ $eax, debug-location !26 + +... diff --git a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir new file mode 100644 index 0000000000000..2396daada876e --- /dev/null +++ b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir @@ -0,0 +1,179 @@ +# RUN: llc -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# +# The test case was artificially adjusted so that the diamond basic-block +# structure is relevant to the propagation of debug entry values.
+# +# CHECK: ![[ARG_B:.*]] = !DILocalVariable(name: "b" +# CHECK: bb.0.entry +# CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() +# CHECK: bb.1.if.then +# CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() +# CHECK: $ebx = MOV32rr $esi +# CHECK-NEXT: DBG_VALUE $ebx, $noreg, ![[ARG_B]], !DIExpression() +# CHECK-NEXT: $esi = MOV32ri 5 +# CHECK-NEXT: $ebx = MOV32ri 1 +# CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.2.if.else +# CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() +# CHECK: $ebx = MOV32rr $esi +# CHECK-NEXT: DBG_VALUE $ebx, $noreg, ![[ARG_B]], !DIExpression() +# CHECK-NEXT: $esi = MOV32ri 1 +# CHECK-NEXT: $ebx = MOV32ri 2 +# CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.3.if.end +# CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# +--- | + ; ModuleID = 'test.c' + source_filename = "test.c" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + ; Function Attrs: noinline nounwind uwtable + define dso_local i32 @fn2(i32 %a, i32 %b, i32 %c) local_unnamed_addr !dbg !12 { + entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !16, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %b, metadata !17, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %c, metadata !18, metadata !DIExpression()), !dbg !20 + %add = add nsw i32 %a, 2, !dbg !21 + call void @llvm.dbg.value(metadata i32 %add, metadata !19, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 6, i32 %add) #3, !dbg !22 + %cmp = icmp slt i32 %b, 17, !dbg !23 + br i1 %cmp, label %if.then, label %if.else, !dbg !25 + + if.then: ; preds = %entry + %add1 = add nsw i32 %b, 7, !dbg !26 + call void @llvm.dbg.value(metadata i32 %add1, metadata !17, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 %add1, i32 %c) #3, !dbg !28 + br label %if.end, !dbg !29 + + if.else: ; preds = %entry + %add2 = add nuw nsw i32 %b, 1, !dbg !30 + call void @llvm.dbg.value(metadata i32 %add2, metadata !17, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 1, i32 %add2, i32 %c) #3, !dbg !32 + br label %if.end + + if.end: ; preds = %if.else, %if.then + %b.addr.0 = phi i32 [ %add1, %if.then ], [ %add2, %if.else ], !dbg !33 + call void @llvm.dbg.value(metadata i32 %b.addr.0, metadata !17, metadata !DIExpression()), !dbg !20 + ret i32 %b.addr.0, !dbg !34 + } + + declare !dbg !4 dso_local void @fn1(i32, i32, i32) local_unnamed_addr + + ; Function Attrs: nounwind readnone speculatable willreturn + declare void @llvm.dbg.value(metadata, metadata, metadata) + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!8, !9, !10} + !llvm.ident = !{!11} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) + !1 = !DIFile(filename: "test.c", directory: "/dir") + !2 = !{} + !3 = !{!4} + !4 = !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) + !5 = !DISubroutineType(types: !6) + !6 = !{null, !7, !7, !7} + !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !8 = !{i32 2, !"Dwarf Version", i32 4} + !9 = !{i32 2, !"Debug Info Version", i32 3} + !10 = !{i32 
1, !"wchar_size", i32 4} + !11 = !{!"clang version 10.0.0"} + !12 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 5, type: !13, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15) + !13 = !DISubroutineType(types: !14) + !14 = !{!7, !7, !7, !7} + !15 = !{!16, !17, !18, !19} + !16 = !DILocalVariable(name: "a", arg: 1, scope: !12, file: !1, line: 5, type: !7) + !17 = !DILocalVariable(name: "b", arg: 2, scope: !12, file: !1, line: 5, type: !7) + !18 = !DILocalVariable(name: "c", arg: 3, scope: !12, file: !1, line: 5, type: !7) + !19 = !DILocalVariable(name: "q", scope: !12, file: !1, line: 7, type: !7) + !20 = !DILocation(line: 0, scope: !12) + !21 = !DILocation(line: 7, column: 15, scope: !12) + !22 = !DILocation(line: 9, column: 5, scope: !12) + !23 = !DILocation(line: 11, column: 11, scope: !24) + !24 = distinct !DILexicalBlock(scope: !12, file: !1, line: 11, column: 9) + !25 = !DILocation(line: 11, column: 9, scope: !12) + !26 = !DILocation(line: 12, column: 13, scope: !27) + !27 = distinct !DILexicalBlock(scope: !24, file: !1, line: 11, column: 17) + !28 = !DILocation(line: 13, column: 8, scope: !27) + !29 = !DILocation(line: 14, column: 5, scope: !27) + !30 = !DILocation(line: 15, column: 13, scope: !31) + !31 = distinct !DILexicalBlock(scope: !24, file: !1, line: 14, column: 12) + !32 = !DILocation(line: 16, column: 7, scope: !31) + !33 = !DILocation(line: 0, scope: !24) + !34 = !DILocation(line: 19, column: 5, scope: !12) + +... +--- +name: fn2 +alignment: 16 +fixedStack: + - { id: 0, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default, + callee-saved-register: '$rbx', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default, + callee-saved-register: '$rbp', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $edi, $edx, $esi, $rbp, $rbx + + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + DBG_VALUE $esi, $noreg, !17, !DIExpression(), debug-location !20 + DBG_VALUE $edx, $noreg, !18, !DIExpression(), debug-location !20 + frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 24 + frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 32 + CFI_INSTRUCTION offset $rbx, -24 + CFI_INSTRUCTION offset $rbp, -16 + $ebp = MOV32rr $edx + DBG_VALUE $ebp, $noreg, !18, !DIExpression(), debug-location !20 + renamable $edi = KILL $edi, implicit-def $rdi + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + renamable $edx = LEA64_32r killed renamable $rdi, 1, $noreg, 2, $noreg, debug-location !21 + DBG_VALUE $edx, $noreg, !19, !DIExpression(), debug-location !20 + $edi = MOV32ri 5, debug-location !22 + CMP32ri8 renamable $ebp, 16, implicit-def $eflags, debug-location !23 + JCC_1 %bb.2, 15, implicit killed $eflags, debug-location !25 + + bb.1.if.then: + successors: %bb.3(0x80000000) + liveins: $ebp, $ebx, $esi + + $ebx = MOV32rr $esi + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $esi = MOV32ri 5, debug-location !28 + $ebx = MOV32ri 1 + JMP_1 %bb.3 + + 
bb.2.if.else: + successors: %bb.3(0x80000000) + liveins: $ebp, $ebx, $esi + + $ebx = MOV32rr $esi + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $esi = MOV32ri 1, debug-location !32 + $ebx = MOV32ri 2 + + bb.3.if.end: + liveins: $ebx, $edi, $ebp + + $esi = MOV32rr $ebx, debug-location !33 + $edx = MOV32rr killed $ebp, debug-location !33 + CALL64pcrel32 @fn1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit $edx, implicit-def $rsp, implicit-def $ssp, debug-location !33 + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $eax = MOV32rr killed $ebx, debug-location !34 + $rsp = frame-destroy ADD64ri8 $rsp, 8, implicit-def dead $eflags, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 24, debug-location !34 + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 16, debug-location !34 + $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 8, debug-location !34 + RETQ killed $eax, debug-location !34 + +... diff --git a/llvm/test/DebugInfo/MIR/X86/kill-entry-value-after-diamond-bbs.mir b/llvm/test/DebugInfo/MIR/X86/kill-entry-value-after-diamond-bbs.mir new file mode 100644 index 0000000000000..0109dc47ef36d --- /dev/null +++ b/llvm/test/DebugInfo/MIR/X86/kill-entry-value-after-diamond-bbs.mir @@ -0,0 +1,180 @@ +# RUN: llc -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# +# The test case was artificially adjusted so that the diamond basic-block +# structure is relevant to the clobbering of debug entry values. +# +# CHECK: ![[ARG_B:.*]] = !DILocalVariable(name: "b" +# CHECK: bb.0.entry +# CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() +# CHECK: bb.1.if.then +# CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() +# CHECK: $ebx = MOV32rr $esi +# CHECK-NEXT: DBG_VALUE $ebx, $noreg, ![[ARG_B]], !DIExpression() +# CHECK-NEXT: $esi = MOV32ri 5 +# CHECK-NEXT: $ebx = MOV32ri 1 +# CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.2.if.else +# CHECK: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression() +# CHECK: $ebp = MOV32rr $esi +# CHECK: DBG_VALUE $ebp, $noreg, ![[ARG_B]], !DIExpression() +# CHECK-NEXT: $esi = MOV32ri 1 +# CHECK-NEXT: $ebp = MOV32ri 2 +# CHECK-NEXT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.3.if.end +# CHECK-NOT: DBG_VALUE $esi, $noreg, ![[ARG_B]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# +--- | + ; ModuleID = 'test.c' + source_filename = "test.c" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + ; Function Attrs: noinline nounwind uwtable + define dso_local i32 @fn2(i32 %a, i32 %b, i32 %c) local_unnamed_addr !dbg !12 { + entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !16, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %b, metadata !17, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %c, metadata !18, metadata !DIExpression()), !dbg !20 + %add = add nsw i32 %a, 2, !dbg !21 + call void @llvm.dbg.value(metadata i32 %add, metadata !19, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 6, i32 %add), !dbg !22 + %cmp = icmp slt i32 %b, 17, !dbg !23 + br i1 %cmp, label %if.then, label %if.else, !dbg !25 + + if.then: ; preds = %entry + %add1 = add nsw i32 %b, 7, 
!dbg !26 + call void @llvm.dbg.value(metadata i32 %add1, metadata !17, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 %add1, i32 %c), !dbg !28 + br label %if.end, !dbg !29 + + if.else: ; preds = %entry + %add2 = add nuw nsw i32 %b, 1, !dbg !30 + call void @llvm.dbg.value(metadata i32 %add2, metadata !17, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 1, i32 %add2, i32 %c), !dbg !32 + br label %if.end + + if.end: ; preds = %if.else, %if.then + %b.addr.0 = phi i32 [ %add1, %if.then ], [ %add2, %if.else ], !dbg !33 + call void @llvm.dbg.value(metadata i32 %b.addr.0, metadata !17, metadata !DIExpression()), !dbg !20 + ret i32 %b.addr.0, !dbg !34 + } + + declare !dbg !4 dso_local void @fn1(i32, i32, i32) local_unnamed_addr + + ; Function Attrs: nounwind readnone speculatable willreturn + declare void @llvm.dbg.value(metadata, metadata, metadata) + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!8, !9, !10} + !llvm.ident = !{!11} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) + !1 = !DIFile(filename: "test.c", directory: "/dir") + !2 = !{} + !3 = !{!4} + !4 = !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) + !5 = !DISubroutineType(types: !6) + !6 = !{null, !7, !7, !7} + !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !8 = !{i32 2, !"Dwarf Version", i32 4} + !9 = !{i32 2, !"Debug Info Version", i32 3} + !10 = !{i32 1, !"wchar_size", i32 4} + !11 = !{!"clang version 10.0.0"} + !12 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 5, type: !13, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15) + !13 = !DISubroutineType(types: !14) + !14 = !{!7, !7, !7, !7} + !15 = !{!16, !17, !18, !19} + !16 = !DILocalVariable(name: "a", arg: 1, scope: !12, file: !1, line: 5, type: !7) + !17 = !DILocalVariable(name: "b", arg: 2, scope: !12, file: !1, line: 5, type: !7) + !18 = !DILocalVariable(name: "c", arg: 3, scope: !12, file: !1, line: 5, type: !7) + !19 = !DILocalVariable(name: "q", scope: !12, file: !1, line: 7, type: !7) + !20 = !DILocation(line: 0, scope: !12) + !21 = !DILocation(line: 7, column: 15, scope: !12) + !22 = !DILocation(line: 9, column: 5, scope: !12) + !23 = !DILocation(line: 11, column: 11, scope: !24) + !24 = distinct !DILexicalBlock(scope: !12, file: !1, line: 11, column: 9) + !25 = !DILocation(line: 11, column: 9, scope: !12) + !26 = !DILocation(line: 12, column: 13, scope: !27) + !27 = distinct !DILexicalBlock(scope: !24, file: !1, line: 11, column: 17) + !28 = !DILocation(line: 13, column: 8, scope: !27) + !29 = !DILocation(line: 14, column: 5, scope: !27) + !30 = !DILocation(line: 15, column: 13, scope: !31) + !31 = distinct !DILexicalBlock(scope: !24, file: !1, line: 14, column: 12) + !32 = !DILocation(line: 16, column: 7, scope: !31) + !33 = !DILocation(line: 0, scope: !24) + !34 = !DILocation(line: 19, column: 5, scope: !12) + +... 
+--- +name: fn2 +alignment: 16 +fixedStack: + - { id: 0, type: spill-slot, offset: -24, size: 8, alignment: 8, stack-id: default, + callee-saved-register: '$rbx', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } + - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default, + callee-saved-register: '$rbp', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $edi, $edx, $esi, $rbp, $rbx + + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + DBG_VALUE $esi, $noreg, !17, !DIExpression(), debug-location !20 + DBG_VALUE $edx, $noreg, !18, !DIExpression(), debug-location !20 + frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 24 + frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 32 + CFI_INSTRUCTION offset $rbx, -24 + CFI_INSTRUCTION offset $rbp, -16 + $ebp = MOV32rr $edx + DBG_VALUE $ebp, $noreg, !18, !DIExpression(), debug-location !20 + renamable $edi = KILL $edi, implicit-def $rdi + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + renamable $edx = LEA64_32r killed renamable $rdi, 1, $noreg, 2, $noreg, debug-location !21 + DBG_VALUE $edx, $noreg, !19, !DIExpression(), debug-location !20 + $edi = MOV32ri 5, debug-location !22 + CMP32ri8 renamable $ebp, 16, implicit-def $eflags, debug-location !23 + JCC_1 %bb.2, 15, implicit killed $eflags, debug-location !25 + + bb.1.if.then: + successors: %bb.3(0x80000000) + liveins: $ebp, $ebx, $esi + + $ebx = MOV32rr $esi + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $esi = MOV32ri 5, debug-location !28 + $ebx = MOV32ri 1 + JMP_1 %bb.3 + + bb.2.if.else: + successors: %bb.3(0x80000000) + liveins: $ebp, $ebx, $esi + + $ebp = MOV32rr $esi + DBG_VALUE $ebp, $noreg, !17, !DIExpression(), debug-location !20 + $esi = MOV32ri 1, debug-location !32 + $ebp = MOV32ri 2 + + bb.3.if.end: + liveins: $ebx, $edi, $ebp + + $esi = MOV32rr $ebx, debug-location !33 + $edx = MOV32rr killed $ebp, debug-location !33 + CALL64pcrel32 @fn1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit $edx, implicit-def $rsp, implicit-def $ssp, debug-location !33 + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $eax = MOV32rr killed $ebx, debug-location !34 + $rsp = frame-destroy ADD64ri8 $rsp, 8, implicit-def dead $eflags, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 24, debug-location !34 + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 16, debug-location !34 + $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 8, debug-location !34 + RETQ killed $eax, debug-location !34 + +... 
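+# For reference, a C source of roughly this shape regenerates the diamond
+# (a hedged reconstruction from the IR and line numbers above, mirroring the
+# companion propagate-entry-value-cross-bbs.mir test; the MIR body itself was
+# hand-adjusted afterwards, so it does not correspond line-for-line):
+#
+# extern void fn1 (int, int, int);
+# __attribute__((noinline))
+# int
+# fn2 (int a, int b, int c) {
+#   int q = 2 + a;
+#   fn1 (5, 6, q);
+#   if (b < 17) {
+#     b = b + 7;
+#     fn1 (5, b, c);
+#   } else {
+#     b = b + 1;
+#     fn1 (1, b, c);
+#   }
+#   return b;
+# }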
+ diff --git a/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir b/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir new file mode 100644 index 0000000000000..86b1cddaa462b --- /dev/null +++ b/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir @@ -0,0 +1,184 @@ +# RUN: llc -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# +#extern void fn1 (int, int, int); +#__attribute__((noinline)) +#int +#fn2 (int a, int b, int c) { +# int q = 2 + a; +# fn1 (5, 6, q); +# if (b < 17) { +# b = b + 7; +# fn1 (5, b, q); +# } else { +# b = b + 1; +# fn1 (1, b, q); +# } +# return b; +#} +# CHECK: ![[ARG_C:.*]] = !DILocalVariable(name: "c" +# CHECK: bb.0.entry: +# CHECK: DBG_VALUE $edx, $noreg, ![[ARG_C]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.1.if.then: +# CHECK: DBG_VALUE $edx, $noreg, ![[ARG_C]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.2.if.else: +# CHECK: DBG_VALUE $edx, $noreg, ![[ARG_C]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# CHECK: bb.3.if.end: +# CHECK: DBG_VALUE $edx, $noreg, ![[ARG_C]], !DIExpression(DW_OP_LLVM_entry_value, 1) +# +--- | + ; ModuleID = 'test.c' + source_filename = "test.c" + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-linux-gnu" + + ; Function Attrs: noinline nounwind uwtable + define dso_local i32 @fn2(i32 %a, i32 %b, i32 %c) local_unnamed_addr !dbg !12 { + entry: + call void @llvm.dbg.value(metadata i32 %a, metadata !16, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %b, metadata !17, metadata !DIExpression()), !dbg !20 + call void @llvm.dbg.value(metadata i32 %c, metadata !18, metadata !DIExpression()), !dbg !20 + %add = add nsw i32 %a, 2, !dbg !21 + call void @llvm.dbg.value(metadata i32 %add, metadata !19, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 6, i32 %add), !dbg !22 + %cmp = icmp slt i32 %b, 17, !dbg !23 + br i1 %cmp, label %if.then, label %if.else, !dbg !25 + + if.then: ; preds = %entry + %add1 = add nsw i32 %b, 7, !dbg !26 + call void @llvm.dbg.value(metadata i32 %add1, metadata !17, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 5, i32 %add1, i32 %add), !dbg !28 + br label %if.end, !dbg !29 + + if.else: ; preds = %entry + %add2 = add nuw nsw i32 %b, 1, !dbg !30 + call void @llvm.dbg.value(metadata i32 %add2, metadata !17, metadata !DIExpression()), !dbg !20 + tail call void @fn1(i32 1, i32 %add2, i32 %add), !dbg !32 + br label %if.end + + if.end: ; preds = %if.else, %if.then + %b.addr.0 = phi i32 [ %add1, %if.then ], [ %add2, %if.else ], !dbg !33 + call void @llvm.dbg.value(metadata i32 %b.addr.0, metadata !17, metadata !DIExpression()), !dbg !20 + ret i32 %b.addr.0, !dbg !34 + } + + declare !dbg !4 dso_local void @fn1(i32, i32, i32) local_unnamed_addr + + ; Function Attrs: nounwind readnone speculatable willreturn + declare void @llvm.dbg.value(metadata, metadata, metadata) + + !llvm.dbg.cu = !{!0} + !llvm.module.flags = !{!8, !9, !10} + !llvm.ident = !{!11} + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) + !1 = !DIFile(filename: "test.c", directory: "/") + !2 = !{} + !3 = !{!4} + !4 = !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !5, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) + !5 = 
!DISubroutineType(types: !6) + !6 = !{null, !7, !7, !7} + !7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) + !8 = !{i32 2, !"Dwarf Version", i32 4} + !9 = !{i32 2, !"Debug Info Version", i32 3} + !10 = !{i32 1, !"wchar_size", i32 4} + !11 = !{!"clang version 10.0.0"} + !12 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 5, type: !13, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !15) + !13 = !DISubroutineType(types: !14) + !14 = !{!7, !7, !7, !7} + !15 = !{!16, !17, !18, !19} + !16 = !DILocalVariable(name: "a", arg: 1, scope: !12, file: !1, line: 5, type: !7) + !17 = !DILocalVariable(name: "b", arg: 2, scope: !12, file: !1, line: 5, type: !7) + !18 = !DILocalVariable(name: "c", arg: 3, scope: !12, file: !1, line: 5, type: !7) + !19 = !DILocalVariable(name: "q", scope: !12, file: !1, line: 7, type: !7) + !20 = !DILocation(line: 0, scope: !12) + !21 = !DILocation(line: 7, column: 15, scope: !12) + !22 = !DILocation(line: 9, column: 5, scope: !12) + !23 = !DILocation(line: 11, column: 11, scope: !24) + !24 = distinct !DILexicalBlock(scope: !12, file: !1, line: 11, column: 9) + !25 = !DILocation(line: 11, column: 9, scope: !12) + !26 = !DILocation(line: 12, column: 13, scope: !27) + !27 = distinct !DILexicalBlock(scope: !24, file: !1, line: 11, column: 17) + !28 = !DILocation(line: 13, column: 8, scope: !27) + !29 = !DILocation(line: 14, column: 5, scope: !27) + !30 = !DILocation(line: 15, column: 13, scope: !31) + !31 = distinct !DILexicalBlock(scope: !24, file: !1, line: 14, column: 12) + !32 = !DILocation(line: 16, column: 7, scope: !31) + !33 = !DILocation(line: 0, scope: !24) + !34 = !DILocation(line: 19, column: 5, scope: !12) + +... 
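+# The callSites table in the MIR body below records, for each of the two
+# calls to fn1 (one in bb.0, one in bb.3), which physical registers forward
+# the arguments (fwdArgRegs: $edi, $esi, $edx); -debug-entry-values relies on
+# such call-site information when reasoning about parameter values at calls.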
+--- +name: fn2 +alignment: 16 +callSites: + - { bb: 0, offset: 20, fwdArgRegs: + - { arg: 0, reg: '$edi' } + - { arg: 1, reg: '$esi' } + - { arg: 2, reg: '$edx' } } + - { bb: 3, offset: 2, fwdArgRegs: + - { arg: 0, reg: '$edi' } + - { arg: 1, reg: '$esi' } + - { arg: 2, reg: '$edx' } } +body: | + bb.0.entry: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $edi, $esi, $rbp, $rbx + + DBG_VALUE $edi, $noreg, !16, !DIExpression(), debug-location !20 + DBG_VALUE $esi, $noreg, !17, !DIExpression(), debug-location !20 + DBG_VALUE $edx, $noreg, !18, !DIExpression(), debug-location !20 + frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 16 + frame-setup PUSH64r killed $rbx, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 24 + frame-setup PUSH64r undef $rax, implicit-def $rsp, implicit $rsp + CFI_INSTRUCTION def_cfa_offset 32 + CFI_INSTRUCTION offset $rbx, -24 + CFI_INSTRUCTION offset $rbp, -16 + $ebx = MOV32rr $esi + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $ebp = MOV32rr $edi + DBG_VALUE $ebp, $noreg, !16, !DIExpression(), debug-location !20 + renamable $ebp = nsw ADD32ri8 killed renamable $ebp, 2, implicit-def dead $eflags, debug-location !21 + DBG_VALUE $ebp, $noreg, !19, !DIExpression(), debug-location !20 + $edi = MOV32ri 5, debug-location !22 + $esi = MOV32ri 6, debug-location !22 + $edx = MOV32rr $ebp, debug-location !22 + CALL64pcrel32 @fn1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit killed $edx, implicit-def $rsp, implicit-def $ssp, debug-location !22 + CMP32ri8 renamable $ebx, 16, implicit-def $eflags, debug-location !23 + JCC_1 %bb.2, 15, implicit killed $eflags, debug-location !25 + + bb.1.if.then: + successors: %bb.3(0x80000000) + liveins: $ebp, $ebx + + renamable $ebx = nsw ADD32ri8 killed renamable $ebx, 7, implicit-def dead $eflags, debug-location !26 + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $edi = MOV32ri 5, debug-location !28 + JMP_1 %bb.3 + + bb.2.if.else: + successors: %bb.3(0x80000000) + liveins: $ebp, $ebx + + renamable $ebx = nuw nsw ADD32ri8 killed renamable $ebx, 1, implicit-def dead $eflags, debug-location !30 + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $edi = MOV32ri 1, debug-location !32 + + bb.3.if.end: + liveins: $ebx, $edi, $ebp + + $esi = MOV32rr $ebx, debug-location !33 + $edx = MOV32rr killed $ebp, debug-location !33 + CALL64pcrel32 @fn1, csr_64, implicit $rsp, implicit $ssp, implicit $edi, implicit $esi, implicit killed $edx, implicit-def $rsp, implicit-def $ssp, debug-location !33 + DBG_VALUE $ebx, $noreg, !17, !DIExpression(), debug-location !20 + $eax = MOV32rr killed $ebx, debug-location !34 + $rsp = frame-destroy ADD64ri8 $rsp, 8, implicit-def dead $eflags, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 24, debug-location !34 + $rbx = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 16, debug-location !34 + $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp, debug-location !34 + CFI_INSTRUCTION def_cfa_offset 8, debug-location !34 + RETQ killed $eax, debug-location !34 + +... 
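+# Note: $edx holds "c" only on entry; it is clobbered in bb.0 when the
+# arguments for the first call to fn1 are set up, and "c" is never reloaded
+# into another location. Describing it with
+# DIExpression(DW_OP_LLVM_entry_value, 1) in every block, as checked above,
+# is therefore the only remaining way to recover its value.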
diff --git a/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll b/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll index 24ba037762a9e..4a767aaf62355 100644 --- a/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll +++ b/llvm/test/DebugInfo/RISCV/relax-debug-frame.ll @@ -1,19 +1,15 @@ -; RUN: llc -filetype=obj -mtriple=riscv32 -mattr=+relax %s -o - \ -; RUN: | llvm-readobj -r | FileCheck -check-prefix=RELAX %s -; RUN: llc -filetype=obj -mtriple=riscv32 -mattr=+relax %s -o - \ -; RUN: | llvm-dwarfdump --debug-frame - 2>&1 \ +; RUN: llc -filetype=obj -mtriple=riscv32 -mattr=+relax %s -o %t.o +; RUN: llvm-readobj -r %t.o | FileCheck -check-prefix=RELAX %s +; RUN: llvm-dwarfdump --debug-frame %t.o 2>&1 \ ; RUN: | FileCheck -check-prefix=RELAX-DWARFDUMP %s ; -; RELAX: Section{{.*}}.rela.{{eh|debug}}_frame { -; RELAX-NOT: {{[}]}} -; RELAX-NOT: 0x0 R_RISCV_ADD32 -; RELAX-NOT: 0x0 R_RISCV_SUB32 -; RELAX-NOT: {{[}]}} -; RELAX: 0x20 R_RISCV_ADD32 -; RELAX: 0x20 R_RISCV_SUB32 -; RELAX-NOT: {{[}]}} -; RELAX: 0x39 R_RISCV_SET6 -; RELAX: 0x39 R_RISCV_SUB6 +; RELAX: Section ({{.*}}) .rela.eh_frame { +; RELAX-NEXT: 0x1C R_RISCV_32_PCREL - 0x0 +; RELAX-NEXT: 0x20 R_RISCV_ADD32 - 0x0 +; RELAX-NEXT: 0x20 R_RISCV_SUB32 - 0x0 +; RELAX-NOT: } +; RELAX: 0x39 R_RISCV_SET6 - 0x0 +; RELAX-NEXT: 0x39 R_RISCV_SUB6 - 0x0 ; ; RELAX-DWARFDUMP-NOT: error: failed to compute relocation ; RELAX-DWARFDUMP: CIE diff --git a/llvm/test/DebugInfo/X86/debug-info-template-align.ll b/llvm/test/DebugInfo/X86/debug-info-template-align.ll new file mode 100644 index 0000000000000..160d88cd0cc94 --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-info-template-align.ll @@ -0,0 +1,63 @@ +; RUN: llc %s -filetype=obj -o - | llvm-dwarfdump -v - | FileCheck %s + +; C++ source to regenerate: + +;typedef char __attribute__((__aligned__(64))) alchar; + +;int main(){ +; alchar newChar; +;} +; $ clang++ -O0 -g -gdwarf-5 debug-info-template-align.cpp -c + +; CHECK: .debug_abbrev contents: + +; CHECK: [5] DW_TAG_typedef DW_CHILDREN_no +; CHECK: DW_AT_alignment DW_FORM_udata + +; CHECK: .debug_info contents: + +;CHECK: DW_TAG_typedef [5] +;CHECK: DW_AT_name {{.*}} "alchar" +;CHECK-NEXT: DW_AT_alignment [DW_FORM_udata] (64) + + +; ModuleID = '/dir/test.cpp' +source_filename = "/dir/test.cpp" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define dso_local i32 @main() #0 !dbg !7 { +entry: + %newChar = alloca i8, align 64 + call void @llvm.dbg.declare(metadata i8* %newChar, metadata !12, metadata !DIExpression()), !dbg !15 + ret i32 0, !dbg !16 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +attributes #0 = { noinline norecurse nounwind optnone uwtable } +attributes #1 = { nounwind readnone speculatable willreturn } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "/dir/test.cpp", directory: "/dir/", checksumkind: CSK_MD5, checksum: "872e252efdfcb9480b4bfaf8437f58ab") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0 "} +!7 = distinct !DISubprogram(name: 
"main", scope: !8, file: !8, line: 12, type: !9, scopeLine: 12, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DIFile(filename: "test.cpp", directory: "/dir", checksumkind: CSK_MD5, checksum: "872e252efdfcb9480b4bfaf8437f58ab") +!9 = !DISubroutineType(types: !10) +!10 = !{!11} +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !DILocalVariable(name: "newChar", scope: !7, file: !8, line: 13, type: !13) +!13 = !DIDerivedType(tag: DW_TAG_typedef, name: "alchar", file: !8, line: 10, baseType: !14, align: 512) +!14 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!15 = !DILocation(line: 13, column: 10, scope: !7) +!16 = !DILocation(line: 14, column: 1, scope: !7) diff --git a/llvm/test/DebugInfo/X86/debug_addr.ll b/llvm/test/DebugInfo/X86/debug_addr.ll index e6dbe7d029d95..55e24b2764c18 100644 --- a/llvm/test/DebugInfo/X86/debug_addr.ll +++ b/llvm/test/DebugInfo/X86/debug_addr.ll @@ -28,7 +28,7 @@ ; DWARF5: .debug_info contents: ; DWARF5: Compile Unit:{{.*}}version = 0x0005 ; DWARF5-NOT: Compile Unit -; DWARF5: DW_TAG_compile_unit +; DWARF5: DW_TAG_skeleton_unit ; DWARF5-NOT: DW_TAG_{{.*}} ; DWARF5: DW_AT_GNU_dwo_name{{.*}}test.dwo ; DWARF5: DW_AT_addr_base{{.*}}0x00000008 diff --git a/llvm/test/DebugInfo/X86/dwarfdump-debug-loc-simple.test b/llvm/test/DebugInfo/X86/dwarfdump-debug-loc-simple.test index 15d688ea72d45..29d27982acce6 100644 --- a/llvm/test/DebugInfo/X86/dwarfdump-debug-loc-simple.test +++ b/llvm/test/DebugInfo/X86/dwarfdump-debug-loc-simple.test @@ -4,19 +4,19 @@ Note: the input file was generated from Inputs/dwarfdump-test-loc-list-32bit.elf CHECK: .debug_info CHECK: DW_AT_name{{.*}}"f" CHECK: DW_AT_location{{.*}}([[F_LOC:0x[0-9a-f]*]] -CHECK-NEXT: [0x00000000, 0x00000023): DW_OP_reg1 ECX -CHECK-NEXT: [0x00000023, 0x0000005d): DW_OP_breg5 EBP-16) +CHECK-NEXT: [0x00000000, 0x00000023) ".text": DW_OP_reg1 ECX +CHECK-NEXT: [0x00000023, 0x0000005d) ".text": DW_OP_breg5 EBP-16) CHECK: DW_AT_name{{.*}}"g" CHECK: DW_AT_location{{.*}}([[G_LOC:0x[0-9a-f]*]] -CHECK-NEXT: [0x00000000, 0x00000020): DW_OP_reg0 EAX -CHECK-NEXT: [0x00000020, 0x0000005d): DW_OP_breg5 EBP-12) +CHECK-NEXT: [0x00000000, 0x00000020) ".text": DW_OP_reg0 EAX +CHECK-NEXT: [0x00000020, 0x0000005d) ".text": DW_OP_breg5 EBP-12) CHECK: .debug_loc contents: CHECK-NEXT: [[F_LOC]]: this is actually the wrong location due to PR14763, but that doesn't matter for the purposes of testing dwarfdump -CHECK-NEXT: (0x00000000, 0x00000023): DW_OP_reg1 ECX -CHECK-NEXT: (0x00000023, 0x0000005d): DW_OP_breg5 EBP-16 +CHECK-NEXT: (0x00000000, 0x00000023) ".text": DW_OP_reg1 ECX +CHECK-NEXT: (0x00000023, 0x0000005d) ".text": DW_OP_breg5 EBP-16 CHECK: [[G_LOC]]: -CHECK-NEXT: (0x00000000, 0x00000020): DW_OP_reg0 EAX -CHECK-NEXT: (0x00000020, 0x0000005d): DW_OP_breg5 EBP-12 +CHECK-NEXT: (0x00000000, 0x00000020) ".text": DW_OP_reg0 EAX +CHECK-NEXT: (0x00000020, 0x0000005d) ".text": DW_OP_breg5 EBP-12 diff --git a/llvm/test/DebugInfo/X86/dwarfdump-rnglists-dwarf64.s b/llvm/test/DebugInfo/X86/dwarfdump-rnglists-dwarf64.s index f8395818734ed..19bbd77586d8f 100644 --- a/llvm/test/DebugInfo/X86/dwarfdump-rnglists-dwarf64.s +++ b/llvm/test/DebugInfo/X86/dwarfdump-rnglists-dwarf64.s @@ -196,14 +196,14 @@ Range1_end: # CHECK-NOT: Compile Unit: # CHECK: DW_TAG_compile_unit # CHECK-NEXT: DW_AT_rnglists_base [DW_FORM_sec_offset] (0x00000014) -# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000020 +# CHECK-NEXT: DW_AT_ranges 
[DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000034 # CHECK-NEXT: [0x0000002a, 0x00000034) ".text") # CHECK: .debug_info.dwo contents: # CHECK: Compile Unit: # CHECK-NOT: contents: # CHECK: DW_TAG_compile_unit -# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000011 +# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000025 # CHECK-NEXT: [0x0000002a, 0x00000034)) #ERR: error: parsing a range list table: did not detect a valid list table with base = 0x8 diff --git a/llvm/test/DebugInfo/X86/dwarfdump-rnglists.s b/llvm/test/DebugInfo/X86/dwarfdump-rnglists.s index 7886374c4d637..0d6898df170bf 100644 --- a/llvm/test/DebugInfo/X86/dwarfdump-rnglists.s +++ b/llvm/test/DebugInfo/X86/dwarfdump-rnglists.s @@ -192,14 +192,14 @@ Range1_end: # CHECK-NOT: Compile Unit: # CHECK: DW_TAG_compile_unit # CHECK-NEXT: DW_AT_rnglists_base [DW_FORM_sec_offset] (0x0000000c) -# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000018 +# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000024 # CHECK-NEXT: [0x0000002a, 0x00000034) ".text") # CHECK: .debug_info.dwo contents: # CHECK: Compile Unit: # CHECK-NOT: contents: # CHECK: DW_TAG_compile_unit -# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000009 +# CHECK-NEXT: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x1) rangelist = 0x00000015 # CHECK-NEXT: [0x0000002a, 0x00000034)) #ERR: error: parsing a range list table: did not detect a valid list table with base = 0x8 diff --git a/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll b/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll index 74e94643b9c08..1761c4aa8fe4b 100644 --- a/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll +++ b/llvm/test/DebugInfo/X86/split-dwarf-v5-ranges.ll @@ -3,7 +3,7 @@ ; CHECK: .debug_info contents: ; CHECK: .debug_info.dwo contents: -; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x00000004 +; CHECK: DW_AT_ranges [DW_FORM_rnglistx] (indexed (0x0) rangelist = 0x00000010 ; CHECK: [0x0000000000000001, 0x000000000000000c) ".text" ; CHECK: [0x000000000000000e, 0x0000000000000013) ".text") diff --git a/llvm/test/DebugInfo/X86/string-offsets-table-order.ll b/llvm/test/DebugInfo/X86/string-offsets-table-order.ll index e39f70c0e3099..ab54930fe916a 100644 --- a/llvm/test/DebugInfo/X86/string-offsets-table-order.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-table-order.ll @@ -12,11 +12,11 @@ ; in different order. 
; CHECK: .debug_info contents:
-; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_skeleton_unit
; CHECK: DW_AT_comp_dir [DW_FORM_strx1] (indexed (00000000) string = "X3")
-; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_skeleton_unit
; CHECK: DW_AT_comp_dir [DW_FORM_strx1] (indexed (00000001) string = "X2")
-; CHECK: DW_TAG_compile_unit
+; CHECK: DW_TAG_skeleton_unit
; CHECK: DW_AT_comp_dir [DW_FORM_strx1] (indexed (00000002) string = "X1")
; CHECK: .debug_info.dwo contents:
diff --git a/llvm/test/DebugInfo/X86/string-offsets-table.ll b/llvm/test/DebugInfo/X86/string-offsets-table.ll
index 9960fd833ed98..be960d1e017c8 100644
--- a/llvm/test/DebugInfo/X86/string-offsets-table.ll
+++ b/llvm/test/DebugInfo/X86/string-offsets-table.ll
@@ -56,7 +56,7 @@
; SPLIT: .debug_info contents:
; SPLIT-NEXT: 0x00000000: Compile Unit:{{.*}}DW_UT_skeleton
; SPLIT-NOT: contents:
-; SPLIT: DW_TAG_compile_unit
+; SPLIT: DW_TAG_skeleton_unit
; SPLIT-NOT: {{DW_TAG|contents:}}
; SPLIT: DW_AT_str_offsets_base [DW_FORM_sec_offset] (0x00000008)
; SPLIT: DW_AT_comp_dir [DW_FORM_strx1] (indexed (00000000) string = "/home/test")
diff --git a/llvm/test/DebugInfo/debugmacinfo-dwo.test b/llvm/test/DebugInfo/debugmacinfo-dwo.test
new file mode 100644
index 0000000000000..7c5f7ef56531d
--- /dev/null
+++ b/llvm/test/DebugInfo/debugmacinfo-dwo.test
@@ -0,0 +1,20 @@
+RUN: llvm-dwarfdump -debug-macro %p/Inputs/dwarfdump-macro.dwo \
+RUN: | FileCheck %s -check-prefix TEST_MACINFODWO
+
+; This test verifies that the llvm-dwarfdump tool knows how to read the
+; .debug_macinfo.dwo section.
+; dwarfdump-macro.dwo has been generated from Inputs/dwarfdump-macro.cc
+; clang++ -c -O0 -DM3=Value3 -include dwarfdump-macro-cmd.h dwarfdump-macro.cc -fdebug-macro -gsplit-dwarf
+
+TEST_MACINFODWO: .debug_macinfo.dwo contents:
+TEST_MACINFODWO: DW_MACINFO_start_file - lineno: 0 filenum: 1
+TEST_MACINFODWO: DW_MACINFO_start_file - lineno: 0 filenum: 2
+TEST_MACINFODWO: DW_MACINFO_define - lineno: 1 macro: M4 Value4
+TEST_MACINFODWO: DW_MACINFO_end_file
+TEST_MACINFODWO: DW_MACINFO_define - lineno: 1 macro: M1 Value1
+TEST_MACINFODWO: DW_MACINFO_start_file - lineno: 2 filenum: 3
+TEST_MACINFODWO: DW_MACINFO_undef - lineno: 4 macro: M1
+TEST_MACINFODWO: DW_MACINFO_define - lineno: 5 macro: M1 NewValue1
+TEST_MACINFODWO: DW_MACINFO_end_file
+TEST_MACINFODWO: DW_MACINFO_define - lineno: 3 macro: M2(x,y) ((x)+(y)* Value2)
+TEST_MACINFODWO: DW_MACINFO_end_file
diff --git a/llvm/test/DebugInfo/symbolize-build-id.test b/llvm/test/DebugInfo/symbolize-build-id.test
new file mode 100644
index 0000000000000..40221ae9e0574
--- /dev/null
+++ b/llvm/test/DebugInfo/symbolize-build-id.test
@@ -0,0 +1,28 @@
+# RUN: yaml2obj %s -o %t
+
+# RUN: llvm-symbolizer --debug-file-directory=/non-existent --obj=%t 0x20112f | FileCheck --check-prefix=UNKNOWN %s
+
+# UNKNOWN: ??
+# UNKNOWN-NEXT: ??:0:0 + +# RUN: llvm-symbolizer --debug-file-directory=%p/Inputs --obj=%t 0x20112f | FileCheck --check-prefix=FOUND %s + +# FOUND: main +# FOUND-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .note.gnu.build-id + Type: SHT_NOTE + Flags: [ SHF_ALLOC ] + Content: 040000000800000003000000474e5500abb50d82b6bdc861 +ProgramHeaders: + - Type: PT_NOTE + Flags: [ PF_R ] + Sections: + - Section: .note.gnu.build-id diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg-blockaddress.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg-blockaddress.ll deleted file mode 100644 index faf60f3acdb30..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg-blockaddress.ll +++ /dev/null @@ -1,23 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v3 -S < %s | FileCheck %s - -define i8* @simp1(i32 %x) { -; CHECK-LABEL: @simp1( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 42 -; CHECK-NEXT: [[ADDR:%.*]] = select i1 [[CMP]], i8* inttoptr (i32 1 to i8*), i8* inttoptr (i32 1 to i8*) -; CHECK-NEXT: ret i8* [[ADDR]] -; -entry: - %cmp = icmp slt i32 %x, 42 - %addr = select i1 %cmp, i8* blockaddress(@simp1, %bb1), i8* blockaddress(@simp1, %bb2) - ret i8* %addr - -bb1: - ret i8* null - -bb2: - ret i8* null -} diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg1.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg1.ll deleted file mode 100644 index cb0f82e37573a..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg1.ll +++ /dev/null @@ -1,90 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v3 -S < %s | FileCheck %s - -define i32 @simp1() { -; CHECK-LABEL: @simp1( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 10 -; -entry: - br i1 true, label %if.then, label %if.else - -if.then: - ret i32 10 - -if.else: - ret i32 12 -} - -define i32 @simp2() { -; CHECK-LABEL: @simp2( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 200 -; -entry: - br i1 false, label %if.then, label %if.else - -if.then: - ret i32 99 - -if.else: - ret i32 200 -} - -declare void @foo(i64) - -define i64 @merge_into_predecessor(i64 %a, i64 %b) { -; CHECK-LABEL: @merge_into_predecessor( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[R:%.*]] = add i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: call void @foo(i64 [[R]]) -; CHECK-NEXT: call void @foo(i64 [[A]]) -; CHECK-NEXT: ret i64 [[R]] -; -entry: - br label %bb.next - -bb.next: - %r = add i64 %a, %b - call void @foo(i64 %r) - call void @foo(i64 %a) - br label %bb.next.next - -bb.next.next: - ret i64 %r -} - -define i64 @merge_into_predecessor_with_phi(i64 %a, i64 %b, i1 %c) { -; CHECK-LABEL: @merge_into_predecessor_with_phi( -; CHECK-NEXT: entry: -; CHECK-NEXT: call void @foo(i64 [[B:%.*]]) -; CHECK-NEXT: [[R:%.*]] = add i64 [[A:%.*]], [[B]] -; CHECK-NEXT: call void @foo(i64 [[R]]) -; CHECK-NEXT: call void @foo(i64 [[A]]) -; CHECK-NEXT: br i1 [[C:%.*]], 
label [[BB_NEXT_NEXT:%.*]], label [[BB_EXIT:%.*]] -; CHECK: bb.next.next: -; CHECK-NEXT: br label [[BB_EXIT]] -; CHECK: bb.exit: -; CHECK-NEXT: [[RET:%.*]] = phi i64 [ [[R]], [[ENTRY:%.*]] ], [ 10, [[BB_NEXT_NEXT]] ] -; CHECK-NEXT: ret i64 [[RET]] -; -entry: - call void @foo(i64 %b) - br label %bb.next - -bb.next: - %r = add i64 %a, %b - call void @foo(i64 %r) - call void @foo(i64 %a) - br i1 %c, label %bb.next.next, label %bb.exit - -bb.next.next: - br label %bb.exit - -bb.exit: - %ret = phi i64 [ %r, %bb.next], [ 10, %bb.next.next] - ret i64 %ret - -} diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg2-dead-block-order.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg2-dead-block-order.ll deleted file mode 100644 index 11b70fa526770..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg2-dead-block-order.ll +++ /dev/null @@ -1,109 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v3 -S < %s | FileCheck %s - -define i32 @remove_dead_blocks() { -; CHECK-LABEL: @remove_dead_blocks( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 1 -; CHECK-NEXT: } -; -entry: - ret i32 1 - -bb.1: - ret i32 2 - -bb.2: - ret i32 3 -} - -define i32 @simp1() { -; CHECK-LABEL: @simp1( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 1 -; CHECK: bb.1: -; CHECK-NEXT: ret i32 2 -; CHECK-NEXT: } -; -entry: - ret i32 1 - -bb.1: - ret i32 2 - -bb.2: - br i1 undef, label %bb.1, label %bb.3 - -bb.3: - ret i32 3 -} - -define i32 @remove_dead_block_with_phi() { -; CHECK-LABEL: @remove_dead_block_with_phi( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BB_2:%.*]] -; CHECK: bb.2: -; CHECK-NEXT: ret i32 1 -; CHECK-NEXT: } -; -entry: - br label %bb.2 - -bb.1: - br label %bb.2 - -bb.2: - %rv = phi i32 [ 1, %entry ], [ 2, %bb.1 ] - ret i32 %rv -} - -define i32 @remove_dead_blocks_remaining_uses(i32 %a) { -; CHECK-LABEL: @remove_dead_blocks_remaining_uses( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 1 -; CHECK-NEXT: } -; -entry: - ret i32 1 - -bb.2: - ret i32 %res - -bb.1: - %res = add i32 %a, 10 - br label %bb.2 -} - -define i32 @remove_dead_blocks_remaining_uses2(i32 %a, i1 %cond) { -; CHECK-LABEL: @remove_dead_blocks_remaining_uses2( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 1 -; CHECK: bb.2: -; CHECK-NEXT: [[RES2:%.*]] = add i32 undef, 10 -; CHECK-NEXT: [[RES3:%.*]] = mul i32 [[RES2]], undef -; CHECK-NEXT: ret i32 [[RES3]] -; CHECK: bb.3: -; CHECK-NEXT: ret i32 undef -; CHECK-NEXT: } -; -entry: - ret i32 1 - -bb.2: - %res2 = add i32 %res, 10 - %res3 = mul i32 %res2, %res - ret i32 %res3 - -bb.3: - br label %bb.4 - -bb.4: - ret i32 %res - -bb.1: - %res = add i32 %a, 10 - br i1 %cond, label %bb.2, label %bb.3 - br label %bb.2 -} diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg3-phis.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg3-phis.ll deleted file mode 100644 index 76db503faeb25..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg3-phis.ll +++ /dev/null @@ -1,70 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg 
-tut-simplifycfg-version=v3 -S < %s | FileCheck %s - -define i32 @phi_cond_branch_eliminated() { -; CHECK-LABEL: @phi_cond_branch_eliminated( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 20 -; -entry: - br i1 true, label %bb.2, label %bb.3 - -bb.2: - br label %bb.3 - -bb.3: - %ret = phi i32 [ 10, %entry ], [ 20, %bb.2 ] - ret i32 %ret -} - -define i32 @phi_removed() { -; CHECK-LABEL: @phi_removed( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[BB_3:%.*]] -; CHECK: bb.3: -; CHECK-NEXT: ret i32 0 -; -entry: - br i1 false, label %bb.2, label %bb.3 - -bb.2: - %pv = phi i32 [ 10, %entry ] - br label %bb.3 - -bb.3: - ret i32 0 -} - -define i32 @phi_in_dead_region() { -; CHECK-LABEL: @phi_in_dead_region( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 1 -; -entry: - ret i32 1 - -bb.1: - br i1 true, label %bb.2, label %bb.3 - -bb.2: - br label %bb.3 - -bb.3: - %ret = phi i32 [ 10, %bb.1 ], [ 20, %bb.2 ] - ret i32 %ret -} - -define i32 @phi_in_mergable_blocks() { -; CHECK-LABEL: @phi_in_mergable_blocks( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 10 -; -entry: - br label %bb.1 - -bb.1: - %pv = phi i32 [ 10, %entry ] - ret i32 %pv -} diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg4-multiple-duplicate-cfg-updates.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg4-multiple-duplicate-cfg-updates.ll deleted file mode 100644 index 82a0e0dac2369..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg4-multiple-duplicate-cfg-updates.ll +++ /dev/null @@ -1,40 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 < %s -S -verify-dom-info | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 < %s -S -verify-dom-info | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v3 < %s -S -verify-dom-info | FileCheck %s - -; Check that we do not crash when we remove edges multiple times in -; the DomTreeUpdater. 
-define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: switch i8 undef, label [[IF_THEN_EPIL:%.*]] [ -; CHECK-NEXT: i8 32, label [[FOR_INC_EPIL:%.*]] -; CHECK-NEXT: i8 46, label [[FOR_INC_EPIL]] -; CHECK-NEXT: i8 95, label [[FOR_INC_EPIL]] -; CHECK-NEXT: i8 45, label [[FOR_INC_EPIL]] -; CHECK-NEXT: i8 126, label [[FOR_INC_EPIL]] -; CHECK-NEXT: ] -; CHECK: if.then.epil: -; CHECK-NEXT: unreachable -; CHECK: for.inc.epil: -; CHECK-NEXT: ret void -; -entry: - br label %for.body.epil - -for.body.epil: ; preds = %entry - switch i8 undef, label %if.then.epil [ - i8 32, label %for.inc.epil - i8 46, label %for.inc.epil - i8 95, label %for.inc.epil - i8 45, label %for.inc.epil - i8 126, label %for.inc.epil - ] - -if.then.epil: ; preds = %for.body.epil - unreachable - -for.inc.epil: ; preds = %for.body.epil, %for.body.epil, %for.body.epil, %for.body.epil, %for.body.epil - ret void -} diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg5-del-phis-for-dead-block.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg5-del-phis-for-dead-block.ll deleted file mode 100644 index b3edd1aa50584..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg5-del-phis-for-dead-block.ll +++ /dev/null @@ -1,122 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 < %s -S -verify-dom-info | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 < %s -S -verify-dom-info | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v3 < %s -S -verify-dom-info | FileCheck %s - -define void @test() { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: switch i32 undef, label [[SW_DEFAULT23:%.*]] [ -; CHECK-NEXT: i32 129, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 215, label [[SW_BB1:%.*]] -; CHECK-NEXT: i32 117, label [[SW_BB1]] -; CHECK-NEXT: i32 207, label [[SW_BB1]] -; CHECK-NEXT: i32 158, label [[SW_BB1]] -; CHECK-NEXT: i32 94, label [[SW_BB1]] -; CHECK-NEXT: i32 219, label [[SW_BB1]] -; CHECK-NEXT: i32 88, label [[SW_BB1]] -; CHECK-NEXT: i32 168, label [[SW_BB1]] -; CHECK-NEXT: i32 295, label [[SW_BB1]] -; CHECK-NEXT: i32 294, label [[SW_BB1]] -; CHECK-NEXT: i32 296, label [[SW_BB1]] -; CHECK-NEXT: i32 67, label [[SW_BB1]] -; CHECK-NEXT: i32 293, label [[SW_BB1]] -; CHECK-NEXT: i32 382, label [[SW_BB1]] -; CHECK-NEXT: i32 335, label [[SW_BB1]] -; CHECK-NEXT: i32 393, label [[SW_BB1]] -; CHECK-NEXT: i32 415, label [[SW_BB1]] -; CHECK-NEXT: i32 400, label [[SW_BB1]] -; CHECK-NEXT: i32 383, label [[SW_BB1]] -; CHECK-NEXT: i32 421, label [[SW_BB1]] -; CHECK-NEXT: i32 422, label [[SW_BB1]] -; CHECK-NEXT: i32 302, label [[SW_BB1]] -; CHECK-NEXT: i32 303, label [[SW_BB1]] -; CHECK-NEXT: i32 304, label [[SW_BB1]] -; CHECK-NEXT: i32 420, label [[SW_BB1]] -; CHECK-NEXT: i32 401, label [[SW_EPILOG24:%.*]] -; CHECK-NEXT: i32 53, label [[SW_BB12:%.*]] -; CHECK-NEXT: i32 44, label [[SW_BB12]] -; CHECK-NEXT: ] -; CHECK: sw.bb: -; CHECK-NEXT: unreachable -; CHECK: sw.bb1: -; CHECK-NEXT: br label [[SW_EPILOG24]] -; CHECK: sw.bb12: -; CHECK-NEXT: switch i32 undef, label [[SW_DEFAULT:%.*]] [ -; CHECK-NEXT: i32 47, label [[SW_BB13:%.*]] -; CHECK-NEXT: i32 8, label [[SW_BB13]] -; CHECK-NEXT: ] -; CHECK: sw.bb13: -; CHECK-NEXT: unreachable -; CHECK: sw.default: -; CHECK-NEXT: unreachable -; CHECK: sw.default23: -; CHECK-NEXT: unreachable -; CHECK: sw.epilog24: -; CHECK-NEXT: [[PREVIOUS_3:%.*]] = phi i32 [ undef, [[SW_BB1]] ], [ 401, [[ENTRY:%.*]] 
] -; CHECK-NEXT: unreachable -; -entry: - br label %while.body - -while.body: ; preds = %entry - switch i32 undef, label %sw.default23 [ - i32 129, label %sw.bb - i32 215, label %sw.bb1 - i32 117, label %sw.bb1 - i32 207, label %sw.bb1 - i32 158, label %sw.bb1 - i32 94, label %sw.bb1 - i32 219, label %sw.bb1 - i32 88, label %sw.bb1 - i32 168, label %sw.bb1 - i32 295, label %sw.bb1 - i32 294, label %sw.bb1 - i32 296, label %sw.bb1 - i32 67, label %sw.bb1 - i32 293, label %sw.bb1 - i32 382, label %sw.bb1 - i32 335, label %sw.bb1 - i32 393, label %sw.bb1 - i32 415, label %sw.bb1 - i32 400, label %sw.bb1 - i32 383, label %sw.bb1 - i32 421, label %sw.bb1 - i32 422, label %sw.bb1 - i32 302, label %sw.bb1 - i32 303, label %sw.bb1 - i32 304, label %sw.bb1 - i32 420, label %sw.bb1 - i32 401, label %sw.epilog24 - i32 53, label %sw.bb12 - i32 44, label %sw.bb12 - ] - -sw.bb: ; preds = %while.body - unreachable - -sw.bb1: ; preds = %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body, %while.body - br i1 false, label %land.lhs.true, label %sw.epilog24 - -land.lhs.true: ; preds = %sw.bb1 - br label %sw.epilog24 - -sw.bb12: ; preds = %while.body, %while.body - switch i32 undef, label %sw.default [ - i32 47, label %sw.bb13 - i32 8, label %sw.bb13 - ] - -sw.bb13: ; preds = %sw.bb12, %sw.bb12 - unreachable - -sw.default: ; preds = %sw.bb12 - unreachable - -sw.default23: ; preds = %while.body - unreachable - -sw.epilog24: ; preds = %land.lhs.true, %sw.bb1, %while.body - %Previous.3 = phi i32 [ undef, %land.lhs.true ], [ undef, %sw.bb1 ], [ 401, %while.body ] - unreachable -} diff --git a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg6-dead-self-loop.ll b/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg6-dead-self-loop.ll deleted file mode 100644 index f9705a6948b21..0000000000000 --- a/llvm/test/Examples/IRTransforms/SimplifyCFG/tut-simplify-cfg6-dead-self-loop.ll +++ /dev/null @@ -1,25 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v1 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v2 -S < %s | FileCheck %s -; RUN: opt -tut-simplifycfg -tut-simplifycfg-version=v3 -S < %s | FileCheck %s - -define i32 @simp1() { -; CHECK-LABEL: @simp1( -; CHECK-NEXT: entry: -; CHECK-NEXT: ret i32 1 -; CHECK: bb.1: -; CHECK-NEXT: br label [[BB_1:%.*]] -; CHECK: bb.2: -; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, [[BB_2:%.*]] ] -; CHECK-NEXT: br label [[BB_2]] -; -entry: - ret i32 1 - -bb.1: - br label %bb.1 - -bb.2: - %p = phi i32 [ 0, %bb.2] - br label %bb.2 -} diff --git a/llvm/test/ExecutionEngine/JITLink/X86/MachO_weak_references.s b/llvm/test/ExecutionEngine/JITLink/X86/MachO_weak_references.s new file mode 100644 index 0000000000000..20fa5536302d7 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/X86/MachO_weak_references.s @@ -0,0 +1,19 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=x86_64-apple-macosx10.9 -filetype=obj -o %t/macho_weak_refs.o %s +# RUN: llvm-jitlink -noexec -check-name=jitlink-check-bar-present -define-abs bar=0x1 -check=%s %t/macho_weak_refs.o +# RUN: llvm-jitlink -noexec -check-name=jitlink-check-bar-absent -check=%s %t/macho_weak_refs.o + +# Test weak reference handling by linking with and 
without a definition of 'bar' available.
+
+ .section __TEXT,__text,regular,pure_instructions
+ .build_version macos, 10, 14 sdk_version 10, 14
+ .globl _main
+ .p2align 4, 0x90
+_main:
+# jitlink-check-bar-present: *{8}(got_addr(macho_weak_refs.o, bar)) = bar
+# jitlink-check-bar-absent: *{8}(got_addr(macho_weak_refs.o, bar)) = 0
+ cmpq $0, bar@GOTPCREL(%rip)
+
+ .weak_reference bar
+
+.subsections_via_symbols
diff --git a/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s b/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s
index c97b1ecce6d6d..5fabc6db1218b 100644
--- a/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s
+++ b/llvm/test/ExecutionEngine/JITLink/X86/MachO_x86-64_relocations.s
@@ -40,6 +40,17 @@ test_gotld:
movq external_data@GOTPCREL(%rip), %rax
retq
+
+# Check X86_64_RELOC_GOTPCREL handling with cmp instructions, which have
+# negative addends.
+#
+# jitlink-check: decode_operand(test_gotcmpq, 3) = got_addr(macho_reloc.o, external_data) - next_pc(test_gotcmpq)
+ .globl test_gotcmpq
+ .align 4, 0x90
+test_gotcmpq:
+ cmpq $0, external_data@GOTPCREL(%rip)
+ retq
+
# Check that calls to external functions trigger the generation of stubs and GOT
# entries.
#
@@ -118,16 +129,16 @@ Lanon_data:
# anonymous.
#
# Note: +8 offset in expression below to account for sizeof(Lanon_data).
-# jitlink-check: *{8}(section_addr(macho_reloc.o, __data) + 8) = (section_addr(macho_reloc.o, __data) + 8) - named_data + 2
+# jitlink-check: *{8}(section_addr(macho_reloc.o, __data) + 8) = (section_addr(macho_reloc.o, __data) + 8) - named_data - 2
.p2align 3
Lanon_minuend_quad:
- .quad Lanon_minuend_quad - named_data + 2
+ .quad Lanon_minuend_quad - named_data - 2
# Note: +16 offset in expression below to account for sizeof(Lanon_data) + sizeof(Lanon_minuend_long).
-# jitlink-check: *{4}(section_addr(macho_reloc.o, __data) + 16) = ((section_addr(macho_reloc.o, __data) + 16) - named_data + 2)[31:0]
+# jitlink-check: *{4}(section_addr(macho_reloc.o, __data) + 16) = ((section_addr(macho_reloc.o, __data) + 16) - named_data - 2)[31:0]
.p2align 2
Lanon_minuend_long:
- .long Lanon_minuend_long - named_data + 2
+ .long Lanon_minuend_long - named_data - 2
# Named quad storage target (first named atom in __data).
.globl named_data
@@ -221,11 +232,11 @@ minuend_long3:
# (i.e. is part of an alt_entry chain that includes 'A').
#
# Check "A: .long B - C + D" where 'B' is an alt_entry for 'A'.
-# jitlink-check: *{4}subtractor_with_alt_entry_minuend_long = (subtractor_with_alt_entry_minuend_long_B - named_data + 2)[31:0]
+# jitlink-check: *{4}subtractor_with_alt_entry_minuend_long = (subtractor_with_alt_entry_minuend_long_B - named_data - 2)[31:0]
.globl subtractor_with_alt_entry_minuend_long
.p2align 2
subtractor_with_alt_entry_minuend_long:
- .long subtractor_with_alt_entry_minuend_long_B - named_data + 2
+ .long subtractor_with_alt_entry_minuend_long_B - named_data - 2
.globl subtractor_with_alt_entry_minuend_long_B
.p2align 2
@@ -234,11 +245,11 @@ subtractor_with_alt_entry_minuend_long_B:
.long 0
# Check "A: .quad B - C + D" where 'B' is an alt_entry for 'A'.
-# jitlink-check: *{8}subtractor_with_alt_entry_minuend_quad = (subtractor_with_alt_entry_minuend_quad_B - named_data + 2) +# jitlink-check: *{8}subtractor_with_alt_entry_minuend_quad = (subtractor_with_alt_entry_minuend_quad_B - named_data - 2) .globl subtractor_with_alt_entry_minuend_quad .p2align 3 subtractor_with_alt_entry_minuend_quad: - .quad subtractor_with_alt_entry_minuend_quad_B - named_data + 2 + .quad subtractor_with_alt_entry_minuend_quad_B - named_data - 2 .globl subtractor_with_alt_entry_minuend_quad_B .p2align 3 @@ -247,11 +258,11 @@ subtractor_with_alt_entry_minuend_quad_B: .quad 0 # Check "A: .long B - C + D" where 'C' is an alt_entry for 'A'. -# jitlink-check: *{4}subtractor_with_alt_entry_subtrahend_long = (named_data - subtractor_with_alt_entry_subtrahend_long_B + 2)[31:0] +# jitlink-check: *{4}subtractor_with_alt_entry_subtrahend_long = (named_data - subtractor_with_alt_entry_subtrahend_long_B - 2)[31:0] .globl subtractor_with_alt_entry_subtrahend_long .p2align 2 subtractor_with_alt_entry_subtrahend_long: - .long named_data - subtractor_with_alt_entry_subtrahend_long_B + 2 + .long named_data - subtractor_with_alt_entry_subtrahend_long_B - 2 .globl subtractor_with_alt_entry_subtrahend_long_B .p2align 2 @@ -260,11 +271,11 @@ subtractor_with_alt_entry_subtrahend_long_B: .long 0 # Check "A: .quad B - C + D" where 'B' is an alt_entry for 'A'. -# jitlink-check: *{8}subtractor_with_alt_entry_subtrahend_quad = (named_data - subtractor_with_alt_entry_subtrahend_quad_B + 2) +# jitlink-check: *{8}subtractor_with_alt_entry_subtrahend_quad = (named_data - subtractor_with_alt_entry_subtrahend_quad_B - 2) .globl subtractor_with_alt_entry_subtrahend_quad .p2align 3 subtractor_with_alt_entry_subtrahend_quad: - .quad named_data - subtractor_with_alt_entry_subtrahend_quad_B + 2 + .quad named_data - subtractor_with_alt_entry_subtrahend_quad_B - 2 .globl subtractor_with_alt_entry_subtrahend_quad_B .p2align 3 diff --git a/llvm/test/Feature/reserve_global_reg.ll b/llvm/test/Feature/reserve_global_reg.ll deleted file mode 100644 index 405f3eea00a5e..0000000000000 --- a/llvm/test/Feature/reserve_global_reg.ll +++ /dev/null @@ -1,30 +0,0 @@ -; REQUIRES: arm -; RUN: not llc < %s -mtriple=thumbv7-apple-darwin -mattr=+reserve-r7 -o - 2>&1 | FileCheck -check-prefix=CHECK-RESERVE-FP7 %s -; RUN: not llc < %s -mtriple=armv7-windows-msvc -mattr=+reserve-r11 -o - 2>&1 | FileCheck -check-prefix=CHECK-RESERVE-FP11 %s -; RUN: not llc < %s -mtriple=thumbv7-windows -mattr=+reserve-r11 -o - 2>&1 | FileCheck -check-prefix=CHECK-RESERVE-FP11-2 %s - -; int test(int a, int b, int c) { -; return a + b + c; -; } - -; Function Attrs: noinline nounwind optnone -define hidden i32 @_Z4testiii(i32 %a, i32 %b, i32 %c) #0 { -entry: - %a.addr = alloca i32, align 4 - %b.addr = alloca i32, align 4 - %c.addr = alloca i32, align 4 - store i32 %a, i32* %a.addr, align 4 - store i32 %b, i32* %b.addr, align 4 - store i32 %c, i32* %c.addr, align 4 - %0 = load i32, i32* %a.addr, align 4 - %1 = load i32, i32* %b.addr, align 4 - %add = add nsw i32 %0, %1 - %2 = load i32, i32* %c.addr, align 4 - %add1 = add nsw i32 %add, %2 - ret i32 %add1 -} - -; CHECK-RESERVE-FP7: Register r7 has been specified but is used as the frame pointer for this target. -; CHECK-RESERVE-FP11: Register r11 has been specified but is used as the frame pointer for this target. -; CHECK-RESERVE-FP11-2: Register r11 has been specified but is used as the frame pointer for this target. 
- diff --git a/llvm/test/FileCheck/dump-input-enable.txt b/llvm/test/FileCheck/dump-input-enable.txt index 511248ea1ac7d..cf47f03dfa835 100644 --- a/llvm/test/FileCheck/dump-input-enable.txt +++ b/llvm/test/FileCheck/dump-input-enable.txt @@ -42,33 +42,32 @@ BADVAL: {{F|f}}ile{{C|c}}heck{{.*}}: for the --dump-input option: Cannot find op ; RUN: %ProtectFileCheckOutput FileCheck -dump-input=help \ ; RUN: | FileCheck %s -check-prefix=HELP -HELP-NOT: {{.}} -HELP: The following description was requested by -dump-input=help -HELP: try{{.*}}-color -HELP-NOT: {{.}} - ;-------------------------------------------------- ; Check -dump-input=never. ; ; Include the case without -v, which isn't covered elsewhere. ;-------------------------------------------------- +; FileCheck success, no -v => no dump, no trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=never 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -allow-empty \ ; RUN: -check-prefixes=NOTRACE,NODUMP +; FileCheck fail, no -v => no dump, no trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=never 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,ERR,NODUMP +; FileCheck success, -v => no dump, trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=never -v 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -check-prefixes=TRACE,NODUMP +; FileCheck fail, -v => no dump, trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=never -v 2>&1 \ @@ -78,11 +77,13 @@ HELP-NOT: {{.}} ; Check no -dump-input, which defaults to never. ;-------------------------------------------------- +; FileCheck success, -v => no dump, trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -v 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -check-prefixes=TRACE,NODUMP +; FileCheck fail, -v => no dump, trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -v 2>&1 \ @@ -94,23 +95,27 @@ HELP-NOT: {{.}} ; Include the case without -v, which isn't covered elsewhere. ;-------------------------------------------------- +; FileCheck success, no -v => no dump, no trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=fail 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -allow-empty \ ; RUN: -check-prefixes=NOTRACE,NODUMP +; FileCheck fail, no -v => dump, no trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=fail 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,ERR,DUMP-ERR +; FileCheck success, -v => no dump, no trace. ; RUN: %ProtectFileCheckOutput \ ; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ ; RUN: -match-full-lines -dump-input=fail -v 2>&1 \ ; RUN: | FileCheck %s -match-full-lines -allow-empty \ ; RUN: -check-prefixes=NOTRACE,NODUMP +; FileCheck fail, -v => dump, no trace. 
; RUN: %ProtectFileCheckOutput \
; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -dump-input=fail -v 2>&1 \
@@ -121,24 +126,32 @@ HELP-NOT: {{.}}
; Check -dump-input-on-failure.
;--------------------------------------------------
+; Command-line option.
+
+; FileCheck success, -v => no dump, no trace.
; RUN: %ProtectFileCheckOutput \
; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -dump-input-on-failure -v 2>&1 \
; RUN: | FileCheck %s -match-full-lines -allow-empty \
; RUN: -check-prefixes=NOTRACE,NODUMP
+; FileCheck fail, -v => dump, no trace.
; RUN: %ProtectFileCheckOutput \
; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -dump-input-on-failure -v 2>&1 \
; RUN: | FileCheck %s -match-full-lines \
; RUN: -check-prefixes=NOTRACE,ERR,DUMP-ERR,DUMP-ERR-V
+; FILECHECK_DUMP_INPUT_ON_FAILURE=1.
+
+; FileCheck success, -v => no dump, no trace.
; RUN: %ProtectFileCheckOutput FILECHECK_DUMP_INPUT_ON_FAILURE=1 \
; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -v 2>&1 \
; RUN: | FileCheck %s -match-full-lines -allow-empty \
; RUN: -check-prefixes=NOTRACE,NODUMP
+; FileCheck fail, -v => dump, no trace.
; RUN: %ProtectFileCheckOutput FILECHECK_DUMP_INPUT_ON_FAILURE=1 \
; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -v 2>&1 \
@@ -149,23 +162,105 @@ HELP-NOT: {{.}}
; Check -dump-input=always.
;--------------------------------------------------
+; FileCheck success, -v => dump, no trace.
; RUN: %ProtectFileCheckOutput \
; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -dump-input=always -v 2>&1 \
; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,DUMP-OK
+; FileCheck fail, -v => dump, no trace.
; RUN: %ProtectFileCheckOutput \
; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
; RUN: -match-full-lines -dump-input=always -v 2>&1 \
; RUN: | FileCheck %s -match-full-lines \
; RUN: -check-prefixes=NOTRACE,ERR,DUMP-ERR,DUMP-ERR-V
+;--------------------------------------------------
+; Check multiple -dump-input options.
+;
+; This occurs most commonly when a test author specifies -dump-input on a
+; specific FileCheck call while a test runner specifies -dump-input in
+; FILECHECK_OPTS, but check the behavior generally.
+;
+; "help" has precedence, and then the most verbose value wins. The most
+; common combinations involve "fail" and "always", so test those the most.
+;--------------------------------------------------
+
+;- - - - - - - - - - - - - - - - - - - - - - - - -
+; Check duplicate.
+;- - - - - - - - - - - - - - - - - - - - - - - - -
+
+; fail, fail => fail (FileCheck fail => dump)
+; RUN: %ProtectFileCheckOutput \
+; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \
+; RUN: -match-full-lines -dump-input=fail -dump-input=fail -v \
+; RUN: 2>&1 \
+; RUN: | FileCheck %s -match-full-lines \
+; RUN: -check-prefixes=NOTRACE,ERR,DUMP-ERR,DUMP-ERR-V
+
+;- - - - - - - - - - - - - - - - - - - - - - - - -
+; Check precedence.
+;- - - - - - - - - - - - - - - - - - - - - - - - - + +; help, always => help +; RUN: %ProtectFileCheckOutput \ +; RUN: FileCheck -input-file %t.err -color %t.check \ +; RUN: -dump-input=help -dump-input=always \ +; RUN: | FileCheck %s -check-prefix=HELP + +; always, fail => always (FileCheck success => dump) +; RUN: %ProtectFileCheckOutput \ +; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ +; RUN: -match-full-lines -dump-input=always -dump-input=fail \ +; RUN: -v 2>&1 \ +; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,DUMP-OK + +; fail, never => fail (FileCheck fail => dump) +; RUN: %ProtectFileCheckOutput \ +; RUN: not FileCheck -input-file %t.err %t.check -check-prefix=CHECK \ +; RUN: -match-full-lines -dump-input=fail -dump-input=never -v \ +; RUN: 2>&1 \ +; RUN: | FileCheck %s -match-full-lines \ +; RUN: -check-prefixes=NOTRACE,ERR,DUMP-ERR,DUMP-ERR-V + +;- - - - - - - - - - - - - - - - - - - - - - - - - +; Check that order doesn't matter. +;- - - - - - - - - - - - - - - - - - - - - - - - - + +; fail, always => always (FileCheck success => dump) +; RUN: %ProtectFileCheckOutput \ +; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ +; RUN: -match-full-lines -dump-input=fail -dump-input=always \ +; RUN: -v 2>&1 \ +; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,DUMP-OK + +;- - - - - - - - - - - - - - - - - - - - - - - - - +; Check that FILECHECK_OPTS isn't handled differently. +;- - - - - - - - - - - - - - - - - - - - - - - - - + +; always, fail => always (FileCheck success => dump) +; RUN: %ProtectFileCheckOutput FILECHECK_OPTS=-dump-input=always \ +; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ +; RUN: -match-full-lines -dump-input=fail -v 2>&1 \ +; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,DUMP-OK + +; fail, always => always (FileCheck success => dump) +; RUN: %ProtectFileCheckOutput FILECHECK_OPTS=-dump-input=fail \ +; RUN: FileCheck -input-file %t.good %t.check -check-prefix=CHECK \ +; RUN: -match-full-lines -dump-input=always -v 2>&1 \ +; RUN: | FileCheck %s -match-full-lines -check-prefixes=NOTRACE,DUMP-OK + ; END. ;-------------------------------------------------- -; Check the output for all cases that actually process directives. +; Check the output. ;-------------------------------------------------- +; HELP-NOT: {{.}} +; HELP: The following description was requested by -dump-input=help +; HELP: try{{.*}}-color +; HELP-NOT: {{.}} + ; Trace is sometimes suppressed. ; TRACE: {{.*}}remark:{{.*}} ; NOTRACE-NOT: remark: diff --git a/llvm/test/Instrumentation/AddressSanitizer/debug-info-alloca.ll b/llvm/test/Instrumentation/AddressSanitizer/debug-info-alloca.ll new file mode 100644 index 0000000000000..ba148e8d6e7a6 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/debug-info-alloca.ll @@ -0,0 +1,75 @@ +; Checks that asan prologue does not add debug locations, which would +; fool findPrologueEndLoc because it sets the end of the prologue to the +; first instruction. Breaking on the instrumented function in a debugger +; would then stop at that instruction, before the prologue is finished. 
+ +; RUN: opt < %s -asan -asan-module -S | FileCheck %s +; 1: void f(int *arg) { +; 2: } +; 3: int main(int argc, char **argv) { +; 4: f(&argc); +; 5: } +; clang 1.cc -g -S -emit-llvm -o - | sed 's/#0 = {/#0 = { sanitize_address/' + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local i32 @main(i32 %argc, i8** %argv) #0 !dbg !15 { +entry: +; No suffix like !dbg !123 +; CHECK: %asan_local_stack_base = alloca i64{{$}} +; CHECK: %3 = call i64 @__asan_stack_malloc_0(i64 64){{$}} + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 %argc, i32* %argc.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %argc.addr, metadata !21, metadata !DIExpression()), !dbg !22 + store i8** %argv, i8*** %argv.addr, align 8 + call void @llvm.dbg.declare(metadata i8*** %argv.addr, metadata !23, metadata !DIExpression()), !dbg !24 + call void @f(i32* %argc.addr), !dbg !25 + ret i32 0, !dbg !26 +} + +define dso_local void @f(i32* %arg) #0 !dbg !7 { +entry: + %arg.addr = alloca i32*, align 8 + store i32* %arg, i32** %arg.addr, align 8 + call void @llvm.dbg.declare(metadata i32** %arg.addr, metadata !12, metadata !DIExpression()), !dbg !13 + ret void, !dbg !14 +} + +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +attributes #0 = { sanitize_address noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable willreturn } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0 (git@github.com:llvm/llvm-project 1ac700cdef787383ad49a0e37d9894491ef19480)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "2.c", directory: "/home/builduser") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0 (git@github.com:llvm/llvm-project 1ac700cdef787383ad49a0e37d9894491ef19480)"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !DILocalVariable(name: "arg", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!13 = !DILocation(line: 1, column: 13, scope: !7) +!14 = !DILocation(line: 2, column: 1, scope: !7) +!15 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !16, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!16 = !DISubroutineType(types: !17) +!17 = !{!11, !11, !18} +!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64) +!19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, 
size: 64) +!20 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!21 = !DILocalVariable(name: "argc", arg: 1, scope: !15, file: !1, line: 3, type: !11) +!22 = !DILocation(line: 3, column: 14, scope: !15) +!23 = !DILocalVariable(name: "argv", arg: 2, scope: !15, file: !1, line: 3, type: !18) +!24 = !DILocation(line: 3, column: 27, scope: !15) +!25 = !DILocation(line: 4, column: 3, scope: !15) +!26 = !DILocation(line: 5, column: 1, scope: !15) diff --git a/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll b/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll index ad3a274c8272c..67e13e56414fd 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/local_stack_base.ll @@ -18,8 +18,8 @@ entry: ; CHECK: %asan_local_stack_base = alloca i64 ; CHECK: %[[ALLOCA:.*]] = ptrtoint i8* %MyAlloca to i64 ; CHECK: %[[PHI:.*]] = phi i64 {{.*}} %[[ALLOCA]], - ; CHECK: store i64 %[[PHI]], i64* %asan_local_stack_base, !dbg - ; CHECK: call void @llvm.dbg.declare(metadata i64* %asan_local_stack_base, metadata !13, metadata !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 32)), !dbg !14 + ; CHECK: store i64 %[[PHI]], i64* %asan_local_stack_base + ; CHECK: call void @llvm.dbg.declare(metadata i64* %asan_local_stack_base, metadata !12, metadata !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 32)), !dbg !13 %0 = load i32, i32* %i.addr, align 4, !dbg !14 %add = add nsw i32 %0, 2, !dbg !15 ret i32 %add, !dbg !16 diff --git a/llvm/test/MC/AArch64/armv8.2a-crypto-apple.s b/llvm/test/MC/AArch64/armv8.2a-crypto-apple.s new file mode 100644 index 0000000000000..1b9153136d057 --- /dev/null +++ b/llvm/test/MC/AArch64/armv8.2a-crypto-apple.s @@ -0,0 +1,41 @@ +// RUN: llvm-mc -output-asm-variant=1 -triple aarch64-apple-ios -mattr=+sha3,+sm4 -show-encoding < %s | FileCheck %s + + sha512h.2d q0, q1, v2 + sha512h2.2d q0, q1, v2 + sha512su0.2d v11, v12 + sha512su1.2d v11, v13, v14 + eor3.16b v25, v12, v7, v2 + rax1.2d v30, v29, v26 + xar.2d v26, v21, v27, #63 + bcax.16b v31, v26, v2, v1 + +//CHECK: sha512h.2d q0, q1, v2 ; encoding: [0x20,0x80,0x62,0xce] +//CHECK: sha512h2.2d q0, q1, v2 ; encoding: [0x20,0x84,0x62,0xce] +//CHECK: sha512su0.2d v11, v12 ; encoding: [0x8b,0x81,0xc0,0xce] +//CHECK: sha512su1.2d v11, v13, v14 ; encoding: [0xab,0x89,0x6e,0xce] +//CHECK: eor3.16b v25, v12, v7, v2 ; encoding: [0x99,0x09,0x07,0xce] +//CHECK: rax1.2d v30, v29, v26 ; encoding: [0xbe,0x8f,0x7a,0xce] +//CHECK: xar.2d v26, v21, v27, #63 ; encoding: [0xba,0xfe,0x9b,0xce] +//CHECK: bcax.16b v31, v26, v2, v1 ; encoding: [0x5f,0x07,0x22,0xce] + + + + sm3ss1.4s v20, v23, v21, v22 + sm3tt1a.4s v20, v23, v21[3] + sm3tt1b.4s v20, v23, v21[3] + sm3tt2a.4s v20, v23, v21[3] + sm3tt2b.4s v20, v23, v21[3] + sm3partw1.4s v30, v29, v26 + sm3partw2.4s v30, v29, v26 + sm4ekey.4s v11, v11, v19 + sm4e.4s v2, v15 + +// CHECK: sm3ss1.4s v20, v23, v21, v22 ; encoding: [0xf4,0x5a,0x55,0xce] +// CHECK: sm3tt1a.4s v20, v23, v21[3] ; encoding: [0xf4,0xb2,0x55,0xce] +// CHECK: sm3tt1b.4s v20, v23, v21[3] ; encoding: [0xf4,0xb6,0x55,0xce] +// CHECK: sm3tt2a.4s v20, v23, v21[3] ; encoding: [0xf4,0xba,0x55,0xce] +// CHECK: sm3tt2b.4s v20, v23, v21[3] ; encoding: [0xf4,0xbe,0x55,0xce] +// CHECK: sm3partw1.4s v30, v29, v26 ; encoding: [0xbe,0xc3,0x7a,0xce] +// CHECK: sm3partw2.4s v30, v29, v26 ; encoding: [0xbe,0xc7,0x7a,0xce] +// CHECK: sm4ekey.4s v11, v11, v19 ; encoding: [0x6b,0xc9,0x73,0xce] +// CHECK: sm4e.4s v2, v15 ; encoding: [0xe2,0x85,0xc0,0xce] diff --git 
a/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s b/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s index 2ca15fceccc8f..056a3ae86c07f 100644 --- a/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s +++ b/llvm/test/MC/AArch64/armv8.3a-signed-pointer.s @@ -307,10 +307,10 @@ // CHECK-REQ: error: instruction requires: pa // CHECK-REQ-NEXT: ldrab x0, [x1] ldraa x0, [x1]! -// CHECK-NEXT: ldraa x0, [x1]! // encoding: [0x20,0x0c,0x20,0xf8] +// CHECK-NEXT: ldraa x0, [x1, #0]! // encoding: [0x20,0x0c,0x20,0xf8] // CHECK-REQ: error: instruction requires: pa // CHECK-REQ-NEXT: ldraa x0, [x1]! ldrab x0, [x1]! -// CHECK-NEXT: ldrab x0, [x1]! // encoding: [0x20,0x0c,0xa0,0xf8] +// CHECK-NEXT: ldrab x0, [x1, #0]! // encoding: [0x20,0x0c,0xa0,0xf8] // CHECK-REQ: error: instruction requires: pa // CHECK-REQ-NEXT: ldrab x0, [x1]! diff --git a/llvm/test/MC/COFF/cfi-sections.s b/llvm/test/MC/COFF/cfi-sections.s new file mode 100644 index 0000000000000..00a8d746c194d --- /dev/null +++ b/llvm/test/MC/COFF/cfi-sections.s @@ -0,0 +1,26 @@ +// RUN: llvm-mc -filetype=obj -triple x86_64-mingw32 %s -o - | llvm-objdump -r - | FileCheck --check-prefix=COFF_X86_64 %s +// RUN: llvm-mc -filetype=obj -triple i686-mingw32 %s -o - | llvm-objdump -r - | FileCheck --check-prefix=COFF_I686 %s + +.cfi_sections .debug_frame + +f1: + .cfi_startproc + nop + .cfi_endproc + +f2: + .cfi_startproc + nop + .cfi_endproc + +// COFF_X86_64: RELOCATION RECORDS FOR [.debug_frame]: +// COFF_X86_64-NEXT: {{.*}} IMAGE_REL_AMD64_SECREL .debug_frame +// COFF_X86_64-NEXT: {{.*}} IMAGE_REL_AMD64_ADDR64 .text +// COFF_X86_64-NEXT: {{.*}} IMAGE_REL_AMD64_SECREL .debug_frame +// COFF_X86_64-NEXT: {{.*}} IMAGE_REL_AMD64_ADDR64 .text + +// COFF_I686: RELOCATION RECORDS FOR [.debug_frame]: +// COFF_I686-NEXT: {{.*}} IMAGE_REL_I386_SECREL .debug_frame +// COFF_I686-NEXT: {{.*}} IMAGE_REL_I386_DIR32 .text +// COFF_I686-NEXT: {{.*}} IMAGE_REL_I386_SECREL .debug_frame +// COFF_I686-NEXT: {{.*}} IMAGE_REL_I386_DIR32 .text diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-signed-pointer.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-signed-pointer.txt index d11056044fa48..7215d086c693c 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-signed-pointer.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-signed-pointer.txt @@ -114,7 +114,7 @@ [0x20,0x04,0x20,0xf8] [0x20,0x04,0xa0,0xf8] -# CHECK: ldraa x0, [x1]! -# CHECK: ldrab x0, [x1]! +# CHECK: ldraa x0, [x1, #0]! +# CHECK: ldrab x0, [x1, #0]! 
[0x20,0x0c,0x20,0xf8] [0x20,0x0c,0xa0,0xf8] diff --git a/llvm/test/MC/MachO/reloc.s b/llvm/test/MC/MachO/reloc.s index 1379d80eb310e..bab5d63d27f45 100644 --- a/llvm/test/MC/MachO/reloc.s +++ b/llvm/test/MC/MachO/reloc.s @@ -37,7 +37,7 @@ L0: .text _f0: L1: - jmp 0xbabecafe + jmp 0x7abecafe jmp L0 jmp L1 ret diff --git a/llvm/test/MC/Mips/ll-expansion.s b/llvm/test/MC/Mips/ll-expansion.s new file mode 100644 index 0000000000000..4653a33d7e787 --- /dev/null +++ b/llvm/test/MC/Mips/ll-expansion.s @@ -0,0 +1,406 @@ +# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips2 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32 +# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips32 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32 +# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips32r2 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32 +# RUN: llvm-mc -filetype=obj -triple mipsn32 -mcpu=mips3 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPSN32 +# RUN: llvm-mc -filetype=obj -triple mipsn32 -mcpu=mips64r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPSN32R6 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64r2 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64 +# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips32r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32R6 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64R6 + +ll $2, 128($sp) +# MIPS32: c3 a2 00 80 ll $2, 128($sp) +# MIPS32R6: 7f a2 40 36 ll $2, 128($sp) +# MIPSN32: c3 a2 00 80 ll $2, 128($sp) +# MIPSN32R6: 7f a2 40 36 ll $2, 128($sp) +# MIPS64: c3 a2 00 80 ll $2, 128($sp) +# MIPS64R6: 7f a2 40 36 ll $2, 128($sp) + +ll $2, -128($sp) +# MIPS32: c3 a2 ff 80 ll $2, -128($sp) +# MIPS32R6: 7f a2 c0 36 ll $2, -128($sp) +# MIPSN32: c3 a2 ff 80 ll $2, -128($sp) +# MIPSN32R6: 7f a2 c0 36 ll $2, -128($sp) +# MIPS64: c3 a2 ff 80 ll $2, -128($sp) +# MIPS64R6: 7f a2 c0 36 ll $2, -128($sp) + +ll $2, 256($sp) +# MIPS32: c3 a2 01 00 ll $2, 256($sp) + +# MIPS32R6: 27 a2 01 00 addiu $2, $sp, 256 +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: c3 a2 01 00 ll $2, 256($sp) + +# MIPSN32R6: 27 a2 01 00 addiu $2, $sp, 256 +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: c3 a2 01 00 ll $2, 256($sp) + +# MIPS64R6: 67 a2 01 00 daddiu $2, $sp, 256 +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, -257($sp) +# MIPS32: c3 a2 fe ff ll $2, -257($sp) + +# MIPS32R6: 27 a2 fe ff addiu $2, $sp, -257 +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: c3 a2 fe ff ll $2, -257($sp) + +# MIPSN32R6: 27 a2 fe ff addiu $2, $sp, -257 +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: c3 a2 fe ff ll $2, -257($sp) + +# MIPS64R6: 67 a2 fe ff daddiu $2, $sp, -257 +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, 32767($sp) +# MIPS32: c3 a2 7f ff ll $2, 32767($sp) + +# MIPS32R6: 27 a2 7f ff addiu $2, $sp, 32767 +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: c3 a2 7f ff ll $2, 32767($sp) + +# MIPSN32R6: 27 a2 7f ff addiu $2, $sp, 32767 +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: c3 a2 7f ff ll $2, 32767($sp) + +# MIPS64R6: 67 a2 7f ff daddiu $2, $sp, 32767 +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, 
32768($sp) +# MIPS32: 3c 02 00 01 lui $2, 1 +# MIPS32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32-NEXT: c0 42 80 00 ll $2, -32768($2) + +# MIPS32R6: 34 02 80 00 ori $2, $zero, 32768 +# MIPS32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: 3c 02 00 01 lui $2, 1 +# MIPSN32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32-NEXT: c0 42 80 00 ll $2, -32768($2) + +# MIPSN32R6: 34 02 80 00 ori $2, $zero, 32768 +# MIPSN32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: 3c 02 00 01 lui $2, 1 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: c0 42 80 00 ll $2, -32768($2) + +# MIPS64R6: 34 02 80 00 ori $2, $zero, 32768 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, -32768($sp) +# MIPS32: c3 a2 80 00 ll $2, -32768($sp) + +# MIPS32R6: 27 a2 80 00 addiu $2, $sp, -32768 +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: c3 a2 80 00 ll $2, -32768($sp) + +# MIPSN32R6: 27 a2 80 00 addiu $2, $sp, -32768 +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: c3 a2 80 00 ll $2, -32768($sp) + +# MIPS64R6: 67 a2 80 00 daddiu $2, $sp, -32768 +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, -32769($sp) +# MIPS32: 3c 02 ff ff lui $2, 65535 +# MIPS32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32-NEXT: c0 42 7f ff ll $2, 32767($2) + +# MIPS32R6: 3c 02 ff ff aui $2, $zero, 65535 +# MIPS32R6-NEXT: 34 42 7f ff ori $2, $2, 32767 +# MIPS32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: 3c 02 ff ff lui $2, 65535 +# MIPSN32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32-NEXT: c0 42 7f ff ll $2, 32767($2) + +# MIPSN32R6: 3c 02 ff ff aui $2, $zero, 65535 +# MIPSN32R6-NEXT: 34 42 7f ff ori $2, $2, 32767 +# MIPSN32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: 3c 02 ff ff lui $2, 65535 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: c0 42 7f ff ll $2, 32767($2) + +# MIPS64R6: 3c 02 ff ff aui $2, $zero, 65535 +# MIPS64R6-NEXT: 34 42 7f ff ori $2, $2, 32767 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, 655987($sp) +# MIPS32: 3c 02 00 0a lui $2, 10 +# MIPS32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32-NEXT: c0 42 02 73 ll $2, 627($2) + +# MIPS32R6: 3c 02 00 0a aui $2, $zero, 10 +# MIPS32R6-NEXT: 34 42 02 73 ori $2, $2, 627 +# MIPS32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: 3c 02 00 0a lui $2, 10 +# MIPSN32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32-NEXT: c0 42 02 73 ll $2, 627($2) + +# MIPSN32R6: 3c 02 00 0a aui $2, $zero, 10 +# MIPSN32R6-NEXT: 34 42 02 73 ori $2, $2, 627 +# MIPSN32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: 3c 02 00 0a lui $2, 10 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: c0 42 02 73 ll $2, 627($2) + +# MIPS64R6: 3c 02 00 0a aui $2, $zero, 10 +# MIPS64R6-NEXT: 34 42 02 73 ori $2, $2, 627 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $2, -655987($sp) +# MIPS32: 3c 02 ff f6 lui $2, 65526 +# MIPS32-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32-NEXT: c0 42 fd 8d ll $2, -627($2) + +# MIPS32R6: 3c 02 ff f5 aui $2, $zero, 65525 +# MIPS32R6-NEXT: 34 42 fd 8d ori $2, $2, 64909 +# MIPS32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPS32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPSN32: 3c 02 ff f6 lui $2, 65526 +# MIPSN32-NEXT: 00 
5d 10 21 addu $2, $2, $sp +# MIPSN32-NEXT: c0 42 fd 8d ll $2, -627($2) + +# MIPSN32R6: 3c 02 ff f5 aui $2, $zero, 65525 +# MIPSN32R6-NEXT: 34 42 fd 8d ori $2, $2, 64909 +# MIPSN32R6-NEXT: 00 5d 10 21 addu $2, $2, $sp +# MIPSN32R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +# MIPS64: 3c 02 ff f6 lui $2, 65526 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: c0 42 fd 8d ll $2, -627($2) + +# MIPS64R6: 3c 02 ff f5 aui $2, $zero, 65525 +# MIPS64R6-NEXT: 34 42 fd 8d ori $2, $2, 64909 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 36 ll $2, 0($2) + +ll $12, symbol +# MIPS32: 3c 0c 00 00 lui $12, 0 +# MIPS32-NEXT: R_MIPS_HI16 symbol +# MIPS32-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPS32-NEXT: R_MIPS_LO16 symbol + +# MIPS32R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS32R6-NEXT: R_MIPS_HI16 symbol +# MIPS32R6-NEXT: 25 8c 00 00 addiu $12, $12, 0 +# MIPS32R6-NEXT: R_MIPS_LO16 symbol +# MIPS32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPSN32: 3c 0c 00 00 lui $12, 0 +# MIPSN32-NEXT: R_MIPS_HI16 symbol +# MIPSN32-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPSN32-NEXT: R_MIPS_LO16 symbol + +# MIPSN32R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPSN32R6-NEXT: R_MIPS_HI16 symbol +# MIPSN32R6-NEXT: 25 8c 00 00 addiu $12, $12, 0 +# MIPSN32R6-NEXT: R_MIPS_LO16 symbol +# MIPSN32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPS64: 3c 0c 00 00 lui $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 0c 60 3c dsll32 $12, $12, 0 +# MIPS64R6-NEXT: 01 81 60 2d daddu $12, $12, $1 +# MIPS64R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +ll $12, symbol($3) +# MIPS32: 3c 0c 00 00 lui $12, 0 +# MIPS32-NEXT: R_MIPS_HI16 symbol +# MIPS32-NEXT: 01 83 60 21 addu $12, $12, $3 +# MIPS32-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPS32-NEXT: R_MIPS_LO16 symbol + +# MIPS32R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS32R6-NEXT: R_MIPS_HI16 symbol +# MIPS32R6-NEXT: 25 8c 00 00 addiu $12, $12, 0 +# MIPS32R6-NEXT: R_MIPS_LO16 symbol +# MIPS32R6-NEXT: 01 83 60 21 addu $12, $12, $3 +# MIPS32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPSN32: 3c 0c 00 00 lui $12, 0 +# MIPSN32-NEXT: R_MIPS_HI16 symbol +# MIPSN32-NEXT: 01 83 60 21 addu $12, $12, $3 +# MIPSN32-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPSN32-NEXT: R_MIPS_LO16 symbol + +# MIPSN32R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPSN32R6-NEXT: R_MIPS_HI16 symbol +# MIPSN32R6-NEXT: 25 8c 00 00 addiu $12, $12, 0 +# MIPSN32R6-NEXT: R_MIPS_LO16 symbol +# MIPSN32R6-NEXT: 01 83 60 21 addu $12, $12, $3 +# MIPSN32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPS64: 3c 0c 00 00 lui $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE 
symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 01 83 60 2d daddu $12, $12, $3 +# MIPS64-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 0c 60 3c dsll32 $12, $12, 0 +# MIPS64R6-NEXT: 01 81 60 2d daddu $12, $12, $1 +# MIPS64R6-NEXT: 01 83 60 2d daddu $12, $12, $3 +# MIPS64R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +ll $12, symbol+8 +# MIPS32: 3c 0c 00 00 lui $12, 0 +# MIPS32-NEXT: R_MIPS_HI16 symbol +# MIPS32-NEXT: c1 8c 00 08 ll $12, 8($12) +# MIPS32-NEXT: R_MIPS_LO16 symbol + +# MIPS32R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS32R6-NEXT: R_MIPS_HI16 symbol +# MIPS32R6-NEXT: 25 8c 00 08 addiu $12, $12, 8 +# MIPS32R6-NEXT: R_MIPS_LO16 symbol +# MIPS32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPSN32: 3c 0c 00 00 lui $12, 0 +# MIPSN32-NEXT: R_MIPS_HI16 symbol+0x8 +# MIPSN32-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPSN32-NEXT: R_MIPS_LO16 symbol+0x8 + +# MIPSN32R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPSN32R6-NEXT: R_MIPS_HI16 symbol+0x8 +# MIPSN32R6-NEXT: 25 8c 00 00 addiu $12, $12, 0 +# MIPSN32R6-NEXT: R_MIPS_LO16 symbol+0x8 +# MIPSN32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPS64: 3c 0c 00 00 lui $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: c1 8c 00 00 ll $12, 0($12) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 + +# MIPS64R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 00 0c 60 3c dsll32 $12, $12, 0 +# MIPS64R6-NEXT: 01 81 60 2d daddu $12, $12, $1 +# MIPS64R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +.option pic2 + +ll $12, symbol +# MIPS32: 8f 8c 00 00 lw $12, 0($gp) +# MIPS32-NEXT: R_MIPS_GOT16 symbol +# MIPS32-NEXT: c1 8c 00 00 ll $12, 0($12) + +# MIPS32R6: 8f 8c 00 00 lw $12, 0($gp) +# MIPS32R6-NEXT: R_MIPS_GOT16 symbol +# MIPS32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPSN32: 8f 8c 00 00 lw $12, 0($gp) +# MIPSN32-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32-NEXT: c1 8c 00 00 ll $12, 0($12) + +# MIPSN32R6: 8f 8c 00 00 lw $12, 0($gp) +# MIPSN32R6-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPS64: df 8c 00 00 ld $12, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: c1 8c 00 00 ll $12, 0($12) + +# 
MIPS64R6: df 8c 00 00 ld $12, 0($gp) +# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +ll $12, symbol+8 +# MIPS32: 8f 8c 00 00 lw $12, 0($gp) +# MIPS32-NEXT: R_MIPS_GOT16 symbol +# MIPS32-NEXT: c1 8c 00 08 ll $12, 8($12) + +# MIPS32R6: 8f 8c 00 00 lw $12, 0($gp) +# MIPS32R6-NEXT: R_MIPS_GOT16 symbol +# MIPS32R6-NEXT: 25 8c 00 08 addiu $12, $12, 8 +# MIPS32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPSN32: 8f 8c 00 00 lw $12, 0($gp) +# MIPSN32-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32-NEXT: c1 8c 00 08 ll $12, 8($12) + +# MIPSN32R6: 8f 8c 00 00 lw $12, 0($gp) +# MIPSN32R6-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32R6-NEXT: 25 8c 00 08 addiu $12, $12, 8 +# MIPSN32R6-NEXT: 7d 8c 00 36 ll $12, 0($12) + +# MIPS64: df 8c 00 00 ld $12, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: c1 8c 00 08 ll $12, 8($12) + +# MIPS64R6: df 8c 00 00 ld $12, 0($gp) +# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 65 8c 00 08 daddiu $12, $12, 8 +# MIPS64R6-NEXT: 7d 8c 00 36 ll $12, 0($12) diff --git a/llvm/test/MC/Mips/lld-expansion.s b/llvm/test/MC/Mips/lld-expansion.s new file mode 100644 index 0000000000000..48755d59a2400 --- /dev/null +++ b/llvm/test/MC/Mips/lld-expansion.s @@ -0,0 +1,188 @@ +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64R6 + +lld $2, 128($sp) +# MIPS64: d3 a2 00 80 lld $2, 128($sp) +# MIPS64R6: 7f a2 40 37 lld $2, 128($sp) + +lld $2, -128($sp) +# MIPS64: d3 a2 ff 80 lld $2, -128($sp) +# MIPS64R6: 7f a2 c0 37 lld $2, -128($sp) + +lld $2, 256($sp) +# MIPS64: d3 a2 01 00 lld $2, 256($sp) + +# MIPS64R6: 67 a2 01 00 daddiu $2, $sp, 256 +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, -257($sp) +# MIPS64: d3 a2 fe ff lld $2, -257($sp) + +# MIPS64R6: 67 a2 fe ff daddiu $2, $sp, -257 +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, 32767($sp) +# MIPS64: d3 a2 7f ff lld $2, 32767($sp) + +# MIPS64R6: 67 a2 7f ff daddiu $2, $sp, 32767 +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, 32768($sp) +# MIPS64: 3c 02 00 01 lui $2, 1 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: d0 42 80 00 lld $2, -32768($2) + +# MIPS64R6: 34 02 80 00 ori $2, $zero, 32768 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, -32768($sp) +# MIPS64: d3 a2 80 00 lld $2, -32768($sp) + +# MIPS64R6: 67 a2 80 00 daddiu $2, $sp, -32768 +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, -32769($sp) +# MIPS64: 3c 02 ff ff lui $2, 65535 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: d0 42 7f ff lld $2, 32767($2) + +# MIPS64R6: 3c 02 ff ff aui $2, $zero, 65535 +# MIPS64R6-NEXT: 34 42 7f ff ori $2, $2, 32767 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, 2147483648($sp) +# MIPS64: 34 02 80 00 ori $2, $zero, 32768 +# MIPS64-NEXT: 00 02 14 38 dsll $2, $2, 16 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: d0 42 00 00 lld $2, 0($2) + +# MIPS64R6: 34 02 80 00 ori $2, $zero, 32768 +# MIPS64R6-NEXT: 00 02 14 38 dsll $2, $2, 16 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, -2147483648($sp) +# MIPS64: 3c 02 80 00 lui $2, 32768 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp 
+# MIPS64-NEXT: d0 42 00 00 lld $2, 0($2) + +# MIPS64R6: 3c 02 80 00 aui $2, $zero, 32768 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $2, 9223372036853775808($sp) +# MIPS64: 3c 02 7f ff lui $2, 32767 +# MIPS64-NEXT: 34 42 ff ff ori $2, $2, 65535 +# MIPS64-NEXT: 00 02 14 38 dsll $2, $2, 16 +# MIPS64-NEXT: 34 42 ff f1 ori $2, $2, 65521 +# MIPS64-NEXT: 00 02 14 38 dsll $2, $2, 16 +# MIPS64-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64-NEXT: d0 42 bd c0 lld $2, -16960($2) + +# MIPS64R6: 3c 02 7f ff aui $2, $zero, 32767 +# MIPS64R6-NEXT: 34 42 ff ff ori $2, $2, 65535 +# MIPS64R6-NEXT: 00 02 14 38 dsll $2, $2, 16 +# MIPS64R6-NEXT: 34 42 ff f0 ori $2, $2, 65520 +# MIPS64R6-NEXT: 00 02 14 38 dsll $2, $2, 16 +# MIPS64R6-NEXT: 34 42 bd c0 ori $2, $2, 48576 +# MIPS64R6-NEXT: 00 5d 10 2d daddu $2, $2, $sp +# MIPS64R6-NEXT: 7c 42 00 37 lld $2, 0($2) + +lld $12, symbol +# MIPS64: 3c 0c 00 00 lui $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: d1 8c 00 00 lld $12, 0($12) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 0c 60 3c dsll32 $12, $12, 0 +# MIPS64R6-NEXT: 01 81 60 2d daddu $12, $12, $1 +# MIPS64R6-NEXT: 7d 8c 00 37 lld $12, 0($12) + +lld $12, symbol($3) +# MIPS64: 3c 0c 00 00 lui $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 01 83 60 2d daddu $12, $12, $3 +# MIPS64-NEXT: d1 8c 00 00 lld $12, 0($12) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6-NEXT: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 0c 60 3c dsll32 $12, $12, 0 +# MIPS64R6-NEXT: 01 81 60 2d daddu $12, $12, $1 +# MIPS64R6-NEXT: 01 83 60 2d daddu $12, $12, $3 +# MIPS64R6-NEXT: 7d 8c 00 37 lld $12, 0($12) + +lld $12, symbol+8 +# MIPS64: 3c 0c 00 00 lui $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# 
MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 0c 64 38 dsll $12, $12, 16 +# MIPS64-NEXT: d1 8c 00 00 lld $12, 0($12) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 + +# MIPS64R6-NEXT: 3c 0c 00 00 aui $12, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 65 8c 00 00 daddiu $12, $12, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 00 0c 60 3c dsll32 $12, $12, 0 +# MIPS64R6-NEXT: 01 81 60 2d daddu $12, $12, $1 +# MIPS64R6-NEXT: 7d 8c 00 37 lld $12, 0($12) + +.option pic2 + +lld $12, symbol +# MIPS64: df 8c 00 00 ld $12, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: d1 8c 00 00 lld $12, 0($12) + +# MIPS64R6: df 8c 00 00 ld $12, 0($gp) +# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 7d 8c 00 37 lld $12, 0($12) + +lld $12, symbol+8 +# MIPS64: df 8c 00 00 ld $12, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: d1 8c 00 08 lld $12, 8($12) + +# MIPS64R6: df 8c 00 00 ld $12, 0($gp) +# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 65 8c 00 08 daddiu $12, $12, 8 +# MIPS64R6-NEXT: 7d 8c 00 37 lld $12, 0($12) diff --git a/llvm/test/MC/Mips/sc-expansion.s b/llvm/test/MC/Mips/sc-expansion.s index 76b30f174f9e2..b407f7aaf5700 100644 --- a/llvm/test/MC/Mips/sc-expansion.s +++ b/llvm/test/MC/Mips/sc-expansion.s @@ -1,48 +1,406 @@ # RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips2 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32 # RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips32 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32 # RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips32r2 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS -# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips3 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS -# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips64 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS -# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips64r2 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32 +# RUN: llvm-mc -filetype=obj -triple mipsn32 -mcpu=mips3 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPSN32 +# RUN: llvm-mc -filetype=obj -triple mipsn32 -mcpu=mips64r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPSN32R6 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64r2 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64 # RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips32r6 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPSR6 -# RUN: llvm-mc -filetype=obj -triple mips -mcpu=mips64r6 %s -o - \ -# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPSR6 
- -# MIPS: e0 6c 00 00 sc $12, 0($3) -# MIPSR6: 7c 6c 00 26 sc $12, 0($3) -sc $12, 0($3) - -# MIPS: e0 6c 00 04 sc $12, 4($3) -# MIPSR6: 7c 6c 02 26 sc $12, 4($3) -sc $12, 4($3) - -# MIPS: 3c 01 00 00 lui $1, 0 -# MIPS: R_MIPS_HI16 symbol -# MIPS: e0 2c 00 00 sc $12, 0($1) -# MIPS: R_MIPS_LO16 symbol - -# MIPSR6: 3c 01 00 00 aui $1, $zero, 0 -# MIPSR6: R_MIPS_HI16 symbol -# MIPSR6: 24 21 00 00 addiu $1, $1, 0 -# MIPSR6: R_MIPS_LO16 symbol -# MIPSR6: 7c 2c 00 26 sc $12, 0($1) +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS32R6 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64R6 + +sc $2, 128($sp) +# MIPS32: e3 a2 00 80 sc $2, 128($sp) +# MIPS32R6: 7f a2 40 26 sc $2, 128($sp) +# MIPSN32: e3 a2 00 80 sc $2, 128($sp) +# MIPSN32R6: 7f a2 40 26 sc $2, 128($sp) +# MIPS64: e3 a2 00 80 sc $2, 128($sp) +# MIPS64R6: 7f a2 40 26 sc $2, 128($sp) + +sc $2, -128($sp) +# MIPS32: e3 a2 ff 80 sc $2, -128($sp) +# MIPS32R6: 7f a2 c0 26 sc $2, -128($sp) +# MIPSN32: e3 a2 ff 80 sc $2, -128($sp) +# MIPSN32R6: 7f a2 c0 26 sc $2, -128($sp) +# MIPS64: e3 a2 ff 80 sc $2, -128($sp) +# MIPS64R6: 7f a2 c0 26 sc $2, -128($sp) + +sc $2, 256($sp) +# MIPS32: e3 a2 01 00 sc $2, 256($sp) + +# MIPS32R6: 27 a1 01 00 addiu $1, $sp, 256 +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: e3 a2 01 00 sc $2, 256($sp) + +# MIPSN32R6: 27 a1 01 00 addiu $1, $sp, 256 +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: e3 a2 01 00 sc $2, 256($sp) + +# MIPS64R6: 67 a1 01 00 daddiu $1, $sp, 256 +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, -257($sp) +# MIPS32: e3 a2 fe ff sc $2, -257($sp) + +# MIPS32R6: 27 a1 fe ff addiu $1, $sp, -257 +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: e3 a2 fe ff sc $2, -257($sp) + +# MIPSN32R6: 27 a1 fe ff addiu $1, $sp, -257 +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: e3 a2 fe ff sc $2, -257($sp) + +# MIPS64R6: 67 a1 fe ff daddiu $1, $sp, -257 +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, 32767($sp) +# MIPS32: e3 a2 7f ff sc $2, 32767($sp) + +# MIPS32R6: 27 a1 7f ff addiu $1, $sp, 32767 +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: e3 a2 7f ff sc $2, 32767($sp) + +# MIPSN32R6: 27 a1 7f ff addiu $1, $sp, 32767 +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: e3 a2 7f ff sc $2, 32767($sp) + +# MIPS64R6: 67 a1 7f ff daddiu $1, $sp, 32767 +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, 32768($sp) +# MIPS32: 3c 01 00 01 lui $1, 1 +# MIPS32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32-NEXT: e0 22 80 00 sc $2, -32768($1) + +# MIPS32R6: 34 01 80 00 ori $1, $zero, 32768 +# MIPS32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: 3c 01 00 01 lui $1, 1 +# MIPSN32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32-NEXT: e0 22 80 00 sc $2, -32768($1) + +# MIPSN32R6: 34 01 80 00 ori $1, $zero, 32768 +# MIPSN32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: 3c 01 00 01 lui $1, 1 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: e0 22 80 00 sc $2, -32768($1) + +# MIPS64R6: 34 01 80 00 ori $1, $zero, 32768 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, -32768($sp) +# MIPS32: e3 a2 80 00 sc $2, -32768($sp) + +# MIPS32R6: 27 a1 80 00 addiu $1, $sp, -32768 +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: e3 a2 80 00 sc $2, -32768($sp) + +# MIPSN32R6: 27 a1 80 00 addiu $1, $sp, -32768 +# 
MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: e3 a2 80 00 sc $2, -32768($sp) + +# MIPS64R6: 67 a1 80 00 daddiu $1, $sp, -32768 +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, -32769($sp) +# MIPS32: 3c 01 ff ff lui $1, 65535 +# MIPS32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32-NEXT: e0 22 7f ff sc $2, 32767($1) + +# MIPS32R6: 3c 01 ff ff aui $1, $zero, 65535 +# MIPS32R6-NEXT: 34 21 7f ff ori $1, $1, 32767 +# MIPS32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: 3c 01 ff ff lui $1, 65535 +# MIPSN32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32-NEXT: e0 22 7f ff sc $2, 32767($1) + +# MIPSN32R6: 3c 01 ff ff aui $1, $zero, 65535 +# MIPSN32R6-NEXT: 34 21 7f ff ori $1, $1, 32767 +# MIPSN32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: 3c 01 ff ff lui $1, 65535 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: e0 22 7f ff sc $2, 32767($1) + +# MIPS64R6: 3c 01 ff ff aui $1, $zero, 65535 +# MIPS64R6-NEXT: 34 21 7f ff ori $1, $1, 32767 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, 655987($sp) +# MIPS32: 3c 01 00 0a lui $1, 10 +# MIPS32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32-NEXT: e0 22 02 73 sc $2, 627($1) + +# MIPS32R6: 3c 01 00 0a aui $1, $zero, 10 +# MIPS32R6-NEXT: 34 21 02 73 ori $1, $1, 627 +# MIPS32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: 3c 01 00 0a lui $1, 10 +# MIPSN32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32-NEXT: e0 22 02 73 sc $2, 627($1) + +# MIPSN32R6: 3c 01 00 0a aui $1, $zero, 10 +# MIPSN32R6-NEXT: 34 21 02 73 ori $1, $1, 627 +# MIPSN32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: 3c 01 00 0a lui $1, 10 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: e0 22 02 73 sc $2, 627($1) + +# MIPS64R6: 3c 01 00 0a aui $1, $zero, 10 +# MIPS64R6-NEXT: 34 21 02 73 ori $1, $1, 627 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $2, -655987($sp) +# MIPS32: 3c 01 ff f6 lui $1, 65526 +# MIPS32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32-NEXT: e0 22 fd 8d sc $2, -627($1) + +# MIPS32R6: 3c 01 ff f5 aui $1, $zero, 65525 +# MIPS32R6-NEXT: 34 21 fd 8d ori $1, $1, 64909 +# MIPS32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPS32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPSN32: 3c 01 ff f6 lui $1, 65526 +# MIPSN32-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32-NEXT: e0 22 fd 8d sc $2, -627($1) + +# MIPSN32R6: 3c 01 ff f5 aui $1, $zero, 65525 +# MIPSN32R6-NEXT: 34 21 fd 8d ori $1, $1, 64909 +# MIPSN32R6-NEXT: 00 3d 08 21 addu $1, $1, $sp +# MIPSN32R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +# MIPS64: 3c 01 ff f6 lui $1, 65526 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: e0 22 fd 8d sc $2, -627($1) + +# MIPS64R6: 3c 01 ff f5 aui $1, $zero, 65525 +# MIPS64R6-NEXT: 34 21 fd 8d ori $1, $1, 64909 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 26 sc $2, 0($1) + +sc $12, symbol +# MIPS32: 3c 01 00 00 lui $1, 0 +# MIPS32-NEXT: R_MIPS_HI16 symbol +# MIPS32-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPS32-NEXT: R_MIPS_LO16 symbol + +# MIPS32R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS32R6-NEXT: R_MIPS_HI16 symbol +# MIPS32R6-NEXT: 24 21 00 00 addiu $1, $1, 0 +# MIPS32R6-NEXT: R_MIPS_LO16 symbol +# MIPS32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPSN32: 3c 01 00 00 lui $1, 0 +# MIPSN32-NEXT: R_MIPS_HI16 symbol +# MIPSN32-NEXT: e0 2c 00 00 sc $12, 0($1) 
+# MIPSN32-NEXT: R_MIPS_LO16 symbol + +# MIPSN32R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPSN32R6-NEXT: R_MIPS_HI16 symbol +# MIPSN32R6-NEXT: 24 21 00 00 addiu $1, $1, 0 +# MIPSN32R6-NEXT: R_MIPS_LO16 symbol +# MIPSN32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPS64: 3c 01 00 00 lui $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +sc $12, symbol($3) +# MIPS32: 3c 01 00 00 lui $1, 0 +# MIPS32-NEXT: R_MIPS_HI16 symbol +# MIPS32-NEXT: 00 23 08 21 addu $1, $1, $3 +# MIPS32-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPS32-NEXT: R_MIPS_LO16 symbol + +# MIPS32R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS32R6-NEXT: R_MIPS_HI16 symbol +# MIPS32R6-NEXT: 24 21 00 00 addiu $1, $1, 0 +# MIPS32R6-NEXT: R_MIPS_LO16 symbol +# MIPS32R6-NEXT: 00 23 08 21 addu $1, $1, $3 +# MIPS32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPSN32: 3c 01 00 00 lui $1, 0 +# MIPSN32-NEXT: R_MIPS_HI16 symbol +# MIPSN32-NEXT: 00 23 08 21 addu $1, $1, $3 +# MIPSN32-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPSN32-NEXT: R_MIPS_LO16 symbol + +# MIPSN32R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPSN32R6-NEXT: R_MIPS_HI16 symbol +# MIPSN32R6-NEXT: 24 21 00 00 addiu $1, $1, 0 +# MIPSN32R6-NEXT: R_MIPS_LO16 symbol +# MIPSN32R6-NEXT: 00 23 08 21 addu $1, $1, $3 +# MIPSN32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPS64: 3c 01 00 00 lui $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 00 23 08 2d daddu $1, $1, $3 +# MIPS64-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 23 08 2d daddu $1, $1, $3 +# MIPS64R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +sc $12, symbol+8 +# MIPS32: 3c 01 00 00 lui $1, 0 +# MIPS32-NEXT: R_MIPS_HI16 symbol +# MIPS32-NEXT: e0 2c 00 08 sc $12, 8($1) +# MIPS32-NEXT: R_MIPS_LO16 
symbol + +# MIPS32R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS32R6-NEXT: R_MIPS_HI16 symbol +# MIPS32R6-NEXT: 24 21 00 08 addiu $1, $1, 8 +# MIPS32R6-NEXT: R_MIPS_LO16 symbol +# MIPS32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPSN32: 3c 01 00 00 lui $1, 0 +# MIPSN32-NEXT: R_MIPS_HI16 symbol+0x8 +# MIPSN32-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPSN32-NEXT: R_MIPS_LO16 symbol+0x8 + +# MIPSN32R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPSN32R6-NEXT: R_MIPS_HI16 symbol+0x8 +# MIPSN32R6-NEXT: 24 21 00 00 addiu $1, $1, 0 +# MIPSN32R6-NEXT: R_MIPS_LO16 symbol+0x8 +# MIPSN32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPS64: 3c 01 00 00 lui $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: e0 2c 00 00 sc $12, 0($1) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 + +# MIPS64R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +.option pic2 + sc $12, symbol +# MIPS32: 8f 81 00 00 lw $1, 0($gp) +# MIPS32-NEXT: R_MIPS_GOT16 symbol +# MIPS32-NEXT: e0 2c 00 00 sc $12, 0($1) + +# MIPS32R6: 8f 81 00 00 lw $1, 0($gp) +# MIPS32R6-NEXT: R_MIPS_GOT16 symbol +# MIPS32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPSN32: 8f 81 00 00 lw $1, 0($gp) +# MIPSN32-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32-NEXT: e0 2c 00 00 sc $12, 0($1) + +# MIPSN32R6: 8f 81 00 00 lw $1, 0($gp) +# MIPSN32R6-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPS64: df 81 00 00 ld $1, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: e0 2c 00 00 sc $12, 0($1) + +# MIPS64R6: df 81 00 00 ld $1, 0($gp) +# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +sc $12, symbol+8 +# MIPS32: 8f 81 00 00 lw $1, 0($gp) +# MIPS32-NEXT: R_MIPS_GOT16 symbol +# MIPS32-NEXT: e0 2c 00 08 sc $12, 8($1) + +# MIPS32R6: 8f 81 00 00 lw $1, 0($gp) +# MIPS32R6-NEXT: R_MIPS_GOT16 symbol +# MIPS32R6-NEXT: 24 21 00 08 addiu $1, $1, 8 +# MIPS32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPSN32: 8f 81 00 00 lw $1, 0($gp) +# MIPSN32-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32-NEXT: e0 2c 00 08 sc $12, 8($1) + +# MIPSN32R6: 8f 81 00 00 lw $1, 0($gp) +# MIPSN32R6-NEXT: R_MIPS_GOT_DISP symbol +# MIPSN32R6-NEXT: 24 21 00 08 addiu $1, $1, 8 +# MIPSN32R6-NEXT: 7c 2c 00 26 sc $12, 0($1) + +# MIPS64: df 81 00 00 ld $1, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: e0 2c 00 08 sc $12, 8($1) -# MIPS: 3c 01 00 00 lui $1, 0 -# MIPS: R_MIPS_HI16 symbol -# MIPS: e0 2c 00 08 sc $12, 8($1) -# MIPS: R_MIPS_LO16 symbol - -# MIPSR6: 3c 01 00 00 aui $1, $zero, 0 -# MIPSR6: R_MIPS_HI16 symbol -# MIPSR6: 24 21 00 08 addiu $1, $1, 8 -# MIPSR6: R_MIPS_LO16 symbol -# MIPSR6: 7c 2c 00 26 sc $12, 
0($1) -sc $12, symbol + 8 +# MIPS64R6: df 81 00 00 ld $1, 0($gp) +# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 08 daddiu $1, $1, 8 +# MIPS64R6-NEXT: 7c 2c 00 26 sc $12, 0($1) diff --git a/llvm/test/MC/Mips/scd-expansion.s b/llvm/test/MC/Mips/scd-expansion.s new file mode 100644 index 0000000000000..54a3baa5d68fe --- /dev/null +++ b/llvm/test/MC/Mips/scd-expansion.s @@ -0,0 +1,188 @@ +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64 +# RUN: llvm-mc -filetype=obj -triple mips64 -mcpu=mips64r6 %s -o - \ +# RUN: | llvm-objdump -d -r - | FileCheck %s --check-prefix=MIPS64R6 + +scd $2, 128($sp) +# MIPS64: f3 a2 00 80 scd $2, 128($sp) +# MIPS64R6: 7f a2 40 27 scd $2, 128($sp) + +scd $2, -128($sp) +# MIPS64: f3 a2 ff 80 scd $2, -128($sp) +# MIPS64R6: 7f a2 c0 27 scd $2, -128($sp) + +scd $2, 256($sp) +# MIPS64: f3 a2 01 00 scd $2, 256($sp) + +# MIPS64R6: 67 a1 01 00 daddiu $1, $sp, 256 +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, -257($sp) +# MIPS64: f3 a2 fe ff scd $2, -257($sp) + +# MIPS64R6: 67 a1 fe ff daddiu $1, $sp, -257 +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, 32767($sp) +# MIPS64: f3 a2 7f ff scd $2, 32767($sp) + +# MIPS64R6: 67 a1 7f ff daddiu $1, $sp, 32767 +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, 32768($sp) +# MIPS64: 3c 01 00 01 lui $1, 1 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: f0 22 80 00 scd $2, -32768($1) + +# MIPS64R6: 34 01 80 00 ori $1, $zero, 32768 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, -32768($sp) +# MIPS64: f3 a2 80 00 scd $2, -32768($sp) + +# MIPS64R6: 67 a1 80 00 daddiu $1, $sp, -32768 +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, -32769($sp) +# MIPS64: 3c 01 ff ff lui $1, 65535 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: f0 22 7f ff scd $2, 32767($1) + +# MIPS64R6: 3c 01 ff ff aui $1, $zero, 65535 +# MIPS64R6-NEXT: 34 21 7f ff ori $1, $1, 32767 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, 2147483648($sp) +# MIPS64: 34 01 80 00 ori $1, $zero, 32768 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: f0 22 00 00 scd $2, 0($1) + +# MIPS64R6: 34 01 80 00 ori $1, $zero, 32768 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, -2147483648($sp) +# MIPS64: 3c 01 80 00 lui $1, 32768 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: f0 22 00 00 scd $2, 0($1) + +# MIPS64R6: 3c 01 80 00 aui $1, $zero, 32768 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 27 scd $2, 0($1) + +scd $2, 9223372036853775808($sp) +# MIPS64: 3c 01 7f ff lui $1, 32767 +# MIPS64-NEXT: 34 21 ff ff ori $1, $1, 65535 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 34 21 ff f1 ori $1, $1, 65521 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64-NEXT: f0 22 bd c0 scd $2, -16960($1) + +# MIPS64R6: 3c 01 7f ff aui $1, $zero, 32767 +# MIPS64R6-NEXT: 34 21 ff ff ori $1, $1, 65535 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 34 21 ff f0 ori $1, $1, 65520 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 34 21 bd c0 ori $1, $1, 48576 +# MIPS64R6-NEXT: 00 3d 08 2d daddu $1, $1, $sp +# MIPS64R6-NEXT: 7c 22 00 27 
scd $2, 0($1) + +scd $12, symbol +# MIPS64: 3c 01 00 00 lui $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: f0 2c 00 00 scd $12, 0($1) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 7c 2c 00 27 scd $12, 0($1) + +scd $12, symbol($3) +# MIPS64: 3c 01 00 00 lui $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 00 23 08 2d daddu $1, $1, $3 +# MIPS64-NEXT: f0 2c 00 00 scd $12, 0($1) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol + +# MIPS64R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64R6-NEXT: 00 23 08 2d daddu $1, $1, $3 +# MIPS64R6-NEXT: 7c 2c 00 27 scd $12, 0($1) + +scd $12, symbol+8 +# MIPS64: 3c 01 00 00 lui $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64-NEXT: f0 2c 00 00 scd $12, 0($1) +# MIPS64-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 + +# MIPS64R6: 3c 01 00 00 aui $1, $zero, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHEST/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HIGHER/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_HI16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 00 01 0c 38 dsll $1, $1, 16 +# MIPS64R6-NEXT: 64 21 00 00 daddiu $1, $1, 0 +# MIPS64R6-NEXT: R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE symbol+0x8 +# MIPS64R6-NEXT: 7c 2c 00 27 scd $12, 0($1) + +.option pic2 + +scd $12, symbol +# MIPS64: df 81 00 00 ld $1, 0($gp) +# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol +# MIPS64-NEXT: f0 
2c 00 00 scd $12, 0($1)
+
+# MIPS64R6: df 81 00 00 ld $1, 0($gp)
+# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol
+# MIPS64R6-NEXT: 7c 2c 00 27 scd $12, 0($1)
+
+scd $12, symbol+8
+# MIPS64: df 81 00 00 ld $1, 0($gp)
+# MIPS64-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol
+# MIPS64-NEXT: f0 2c 00 08 scd $12, 8($1)
+
+# MIPS64R6: df 81 00 00 ld $1, 0($gp)
+# MIPS64R6-NEXT: R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE symbol
+# MIPS64R6-NEXT: 64 21 00 08 daddiu $1, $1, 8
+# MIPS64R6-NEXT: 7c 2c 00 27 scd $12, 0($1)
diff --git a/llvm/test/MC/X86/x86-jcxz-loop-fixup.s b/llvm/test/MC/X86/x86-jcxz-loop-fixup.s
new file mode 100644
index 0000000000000..219c1bb52eb6b
--- /dev/null
+++ b/llvm/test/MC/X86/x86-jcxz-loop-fixup.s
@@ -0,0 +1,26 @@
+# RUN: not llvm-mc -filetype=obj -triple=x86_64-linux-gnu %s 2>&1 | FileCheck %s
+
+ .balign 128
+label00:
+// CHECK: value of 253 is too large for field of 1 byte.
+ jecxz label01
+// CHECK: value of 251 is too large for field of 1 byte.
+ jrcxz label01
+// CHECK: value of 249 is too large for field of 1 byte.
+ loop label01
+// CHECK: value of 247 is too large for field of 1 byte.
+ loope label01
+// CHECK: value of 245 is too large for field of 1 byte.
+ loopne label01
+ .balign 256
+label01:
+// CHECK: value of -259 is too large for field of 1 byte.
+ jecxz label00
+// CHECK: value of -261 is too large for field of 1 byte.
+ jrcxz label00
+// CHECK: value of -263 is too large for field of 1 byte.
+ loop label00
+// CHECK: value of -265 is too large for field of 1 byte.
+ loope label00
+// CHECK: value of -267 is too large for field of 1 byte.
+ loopne label00
diff --git a/llvm/test/MachineVerifier/verify-regops.mir b/llvm/test/MachineVerifier/verify-regops.mir
new file mode 100644
index 0000000000000..9219586ffc03b
--- /dev/null
+++ b/llvm/test/MachineVerifier/verify-regops.mir
@@ -0,0 +1,37 @@
+# RUN: not llc -march=x86 -o - %s -run-pass=none -verify-machineinstrs \
+# RUN: 2>&1 | FileCheck %s
+# REQUIRES: x86-registered-target
+#
+# Check that MachineVerifier catches corrupt operands where MO->isReg()
+# returns true, but the descriptor says it should be an OPERAND_IMMEDIATE or
+# OPERAND_PCREL. Conversely, where both MO->isReg() and MO->isFI() return
+# false, check that an operand whose descriptor expects an OPERAND_REGISTER
+# is reported.
+
+# CHECK-LABEL: fun
+
+# CHECK: *** Bad machine code: Expected a register operand. ***
+# CHECK: - instruction: %1:gr32 = XOR32rm -1, %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load 4 from %fixed-stack.1, align 8)
+# CHECK: - operand 1: -1
+
+# CHECK: *** Bad machine code: Expected a non-register operand. ***
+# CHECK: - instruction: %2:gr32 = OR32ri %1:gr32(tied-def 0), %0:gr32, implicit-def dead $eflags
+# CHECK: - operand 2: %0:gr32
+
+
+name: fun
+tracksRegLiveness: true
+fixedStack:
+ - { id: 1, offset: 8, size: 4, alignment: 8, isImmutable: true }
+ - { id: 3, size: 4, alignment: 16, isImmutable: true }
+body: |
+ bb.0:
+ %0:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.3, align 16)
+ ; Was: %1:gr32 = XOR32rm %0, %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load 4 from %fixed-stack.1, align 8)
+ %1:gr32 = XOR32rm -1, %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load 4 from %fixed-stack.1, align 8)
+ ; Was: %2:gr32 = OR32ri %1, -256, implicit-def dead $eflags
+ %2:gr32 = OR32ri %1, %0, implicit-def dead $eflags
+ %3:gr32 = MOV32ri -1
+ $eax = COPY %2
+ $edx = COPY %3
+ RET 0, $eax, $edx
+...
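
A rough sketch of the verifier logic the two cases above exercise (simplified
and assumed for illustration; the exact code in LLVM's MachineVerifier.cpp may
differ): each MachineOperand is compared against the MCOperandInfo from its
instruction's MCInstrDesc, along these lines in C++:

    // MO is the operand being visited, MONum its index, MCID the MCInstrDesc.
    const MCOperandInfo &MCOI = MCID.OpInfo[MONum];
    switch (MCOI.OperandType) {
    case MCOI::OPERAND_REGISTER:
      if (!MO->isReg() && !MO->isFI())
        report("Expected a register operand.", MO, MONum);
      break;
    case MCOI::OPERAND_IMMEDIATE:
    case MCOI::OPERAND_PCREL:
      if (MO->isReg())
        report("Expected a non-register operand.", MO, MONum);
      break;
    }

Note that the register arm also admits frame indices (MO->isFI()), which is
why the XOR32rm case only fires once its register operand is replaced by the
immediate -1.
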
diff --git a/llvm/test/Object/invalid.test b/llvm/test/Object/invalid.test index 254b8f43e7bcd..37563652bd630 100644 --- a/llvm/test/Object/invalid.test +++ b/llvm/test/Object/invalid.test @@ -41,7 +41,7 @@ Sections: # RUN: not llvm-objdump -s %p/Inputs/invalid-strtab-size.elf 2>&1 \ # RUN: | FileCheck %s -DFILE=%p/Inputs/invalid-strtab-size.elf --check-prefix=INVALID-STRTAB-SIZE -# INVALID-STRTAB-SIZE: error: '[[FILE]]': section [index 1] has a sh_offset (0x70) + sh_size (0xffffff) that cannot be represented +# INVALID-STRTAB-SIZE: error: '[[FILE]]': section [index 1] has a sh_offset (0x70) + sh_size (0xffffff) that is greater than the file size (0x218) ## Check that llvm-dwarfdump reports an error during relocation resolution ## when instead of expected SHT_RELA section it locates a section of a different type. @@ -252,7 +252,7 @@ Symbols: [] # RUN: not llvm-readobj -r %t12 2>&1 | FileCheck -DFILE=%t12 --check-prefix=INVALID-RELOC-SH-OFFSET %s # RUN: not llvm-readobj -r %t13 2>&1 | FileCheck -DFILE=%t13 --check-prefix=INVALID-RELOC-SH-OFFSET %s -# INVALID-RELOC-SH-OFFSET: error: '[[FILE]]': section [index 1] has a sh_offset (0x10000) + sh_size (0x0) that cannot be represented +# INVALID-RELOC-SH-OFFSET: error: '[[FILE]]': section [index 1] has a sh_offset (0x10000) + sh_size (0x0) that is greater than the file size (0x160) --- !ELF FileHeader: @@ -286,7 +286,7 @@ Sections: --- !ELF FileHeader: - Class: ELFCLASS64 + Class: ELFCLASS32 Data: ELFDATA2LSB Type: ET_REL Machine: EM_386 @@ -375,7 +375,7 @@ Sections: # RUN: not llvm-readobj --sections --section-data %t18 2>&1 \ # RUN: | FileCheck -DFILE=%t18 --check-prefix=BROKEN-SECSHOFFSET %s -# BROKEN-SECSHOFFSET: error: '[[FILE]]': section [index 1] has a sh_offset (0xffff0000) + sh_size (0x0) that cannot be represented +# BROKEN-SECSHOFFSET: error: '[[FILE]]': section [index 1] has a sh_offset (0xffff0000) + sh_size (0x0) that is greater than the file size (0x160) --- !ELF FileHeader: diff --git a/llvm/test/Other/2010-05-06-Printer.ll b/llvm/test/Other/2010-05-06-Printer.ll index 9e7c9cb6ab4a8..decd977c3d212 100644 --- a/llvm/test/Other/2010-05-06-Printer.ll +++ b/llvm/test/Other/2010-05-06-Printer.ll @@ -16,6 +16,5 @@ define void @foo(){ ;ALL: ModuleID = ;FOO: IR Dump After -;FOO-EMPTY: ;FOO-NEXT: define void @foo() ;FOO-NOT: define void @tester diff --git a/llvm/test/Other/printer.ll b/llvm/test/Other/printer.ll index 9785a17b2280a..8633765628550 100644 --- a/llvm/test/Other/printer.ll +++ b/llvm/test/Other/printer.ll @@ -1,5 +1,7 @@ -; RUN: opt -mem2reg -instcombine -print-after-all -disable-output < %s 2>&1 | FileCheck %s -; RUN: opt -passes='mem2reg,instcombine' -print-after-all -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -mem2reg -instcombine -print-after-all -disable-output < %s 2>&1 | \ +; RUN: FileCheck --check-prefixes=CHECK,OLDPM %s --implicit-check-not='IR Dump' +; RUN: opt -passes='mem2reg,instcombine' -print-after-all -disable-output < %s 2>&1 | \ +; RUN: FileCheck --check-prefixes=CHECK,NEWPM %s --implicit-check-not='IR Dump' define void @tester(){ ret void } @@ -8,21 +10,14 @@ define void @foo(){ ret void } -;CHECK-NOT: IR Dump After PassManager -;CHECK-NOT: IR Dump After ModuleToFunctionPassAdaptor -; -;CHECK: *** IR Dump After {{Promote Memory to Register|PromotePass}} -;CHECK: define void @tester -;CHECK-NOT: define void @foo -;CHECK: *** IR Dump After {{Combine redundant instructions|InstCombinePass}} -;CHECK: define void @tester -;CHECK-NOT: define void @foo -;CHECK: *** IR Dump After {{Promote Memory to 
Register|PromotePass}} -;CHECK: define void @foo -;CHECK-NOT: define void @tester -;CHECK: *** IR Dump After {{Combine redundant instructions|InstCombinePass}} -;CHECK: define void @foo -;CHECK-NOT: define void @tester -;CHECK: *** IR Dump After {{Module Verifier|VerifierPass}} -; -;CHECK-NOT: IR Dump After Print Module IR +; NEWPM: *** IR Dump After VerifierPass +; CHECK: *** IR Dump After {{Promote Memory to Register|PromotePass}} +; CHECK-NEXT: define void @tester +; CHECK: *** IR Dump After {{Combine redundant instructions|InstCombinePass}} +; CHECK-NEXT: define void @tester +; OLDPM: *** IR Dump After Module Verifier +; CHECK: *** IR Dump After {{Promote Memory to Register|PromotePass}} +; CHECK-NEXT: define void @foo +; CHECK: *** IR Dump After {{Combine redundant instructions|InstCombinePass}} +; CHECK-NEXT: define void @foo +; CHECK: *** IR Dump After {{Module Verifier|VerifierPass}} diff --git a/llvm/test/ThinLTO/X86/diagnostic-handler-remarks.ll b/llvm/test/ThinLTO/X86/diagnostic-handler-remarks.ll index b83e93b7a29de..a996b8586eef8 100644 --- a/llvm/test/ThinLTO/X86/diagnostic-handler-remarks.ll +++ b/llvm/test/ThinLTO/X86/diagnostic-handler-remarks.ll @@ -51,6 +51,19 @@ ; YAML2-NEXT: - String: ')' ; YAML2-NEXT: ... +; The file extension depends on the format of the remarks +; RUN: rm -f %t.bitstream.thin.0.bitstream %t.bitstream.thin.1.bitstream +; RUN: llvm-lto -thinlto-action=run \ +; RUN: -lto-pass-remarks-output=%t.bitstream \ +; RUN: -lto-pass-remarks-filter=inline \ +; RUN: -lto-pass-remarks-format=bitstream \ +; RUN: -exported-symbol _func2 \ +; RUN: -exported-symbol _main %t1.bc %t2.bc 2>&1 | \ +; RUN: FileCheck %s -allow-empty +; RUN: llvm-bcanalyzer %t.bitstream.thin.0.bitstream +; RUN: llvm-bcanalyzer %t.bitstream.thin.1.bitstream +; CHECK-NOT: remark: +; CHECK-NOT: llvm-lto: target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.11.0" diff --git a/llvm/test/ThinLTO/X86/index-const-prop2.ll b/llvm/test/ThinLTO/X86/index-const-prop2.ll index 430c7e8156d2a..928d00adc9a23 100644 --- a/llvm/test/ThinLTO/X86/index-const-prop2.ll +++ b/llvm/test/ThinLTO/X86/index-const-prop2.ll @@ -36,6 +36,8 @@ ; RUN: -o %t4 ; RUN: llvm-dis %t4.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT2 +; Run again but with main2 exported instead of main to check that write-only +; variables are optimized out. ; RUN: llvm-lto2 run %t1.bc %t2.bc -save-temps \ ; RUN: -r=%t2.bc,foo,pl \ ; RUN: -r=%t2.bc,bar,pl \ @@ -49,7 +51,7 @@ ; RUN: -r=%t1.bc,baz, \ ; RUN: -r=%t1.bc,gBar, \ ; RUN: -o %t5 -; RUN: llvm-dis %t5.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT +; RUN: llvm-dis %t5.1.3.import.bc -o - | FileCheck %s --check-prefix=IMPORT-WRITEONLY ; RUN: llvm-dis %t5.1.5.precodegen.bc -o - | FileCheck %s --check-prefix=CODEGEN2 ; Check that gFoo and gBar were eliminated from source module together ; with corresponding stores @@ -59,6 +61,10 @@ ; IMPORT-NEXT: @gBar = internal local_unnamed_addr global i32 2, align 4 ; IMPORT: !DICompileUnit({{.*}}) +; Write-only variables are imported with a zero initializer.
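+; (Illustrative sketch added during editing, not part of the original test: a global is write-only +; when its only remaining uses across the program are stores, e.g. +;   @gFoo = global i32 1 +;   define void @onlyWrites() { store i32 7, i32* @gFoo  ret void } +; where @onlyWrites is a hypothetical user. Since the stored values can never be read, the stores +; are dropped and the initializer is imported as zero, as the checks below expect.)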
+; IMPORT-WRITEONLY: @gFoo.llvm.0 = internal unnamed_addr global i32 0 +; IMPORT-WRITEONLY: @gBar = internal local_unnamed_addr global i32 0 + ; CODEGEN: i32 @main() ; CODEGEN-NEXT: ret i32 3 diff --git a/llvm/test/ThinLTO/X86/writeonly-with-refs.ll b/llvm/test/ThinLTO/X86/writeonly-with-refs.ll index 63f75762c39b1..787d977582211 100644 --- a/llvm/test/ThinLTO/X86/writeonly-with-refs.ll +++ b/llvm/test/ThinLTO/X86/writeonly-with-refs.ll @@ -7,10 +7,22 @@ ; RUN: -r=%t2,outer,pl ; @outer should have been internalized and converted to zeroinitializer. -; RUN: llvm-dis %t-out.1.5.precodegen.bc -o - | FileCheck %s -; RUN: llvm-dis %t-out.2.5.precodegen.bc -o - | FileCheck %s +; RUN: llvm-dis %t-out.1.3.import.bc -o - | FileCheck %s +; RUN: llvm-dis %t-out.2.3.import.bc -o - | FileCheck %s -; CHECK: @outer = internal unnamed_addr global %struct.Q zeroinitializer +; CHECK: @outer = internal local_unnamed_addr global %struct.Q zeroinitializer + +; Test again in distributed ThinLTO mode. +; RUN: llvm-lto2 run -save-temps %t1 %t2 -o %t-out \ +; RUN: -thinlto-distributed-indexes \ +; RUN: -r=%t1,main,plx \ +; RUN: -r=%t1,_Z3foov,l \ +; RUN: -r=%t2,_Z3foov,pl \ +; RUN: -r=%t2,outer,pl +; RUN: opt -function-import -import-all-index -enable-import-metadata -summary-file %t1.thinlto.bc %t1 -o %t1.out +; RUN: opt -function-import -import-all-index -summary-file %t2.thinlto.bc %t2 -o %t2.out +; RUN: llvm-dis %t1.out -o - | FileCheck %s +; RUN: llvm-dis %t2.out -o - | FileCheck %s source_filename = "main.cpp" target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/ThinLTO/X86/writeonly.ll b/llvm/test/ThinLTO/X86/writeonly.ll index 20f4533efe686..27305e160ea71 100644 --- a/llvm/test/ThinLTO/X86/writeonly.ll +++ b/llvm/test/ThinLTO/X86/writeonly.ll @@ -11,8 +11,8 @@ ; RUN: llvm-dis %t1.imported.bc -o - | FileCheck %s --check-prefix=IMPORT ; RUN: llvm-lto -thinlto-action=optimize %t1.imported.bc -o - | llvm-dis - -o - | FileCheck %s --check-prefix=OPTIMIZE -; IMPORT: @gFoo.llvm.0 = internal unnamed_addr global i32 1, align 4, !dbg !0 -; IMPORT-NEXT: @gBar = internal local_unnamed_addr global i32 2, align 4, !dbg !5 +; IMPORT: @gFoo.llvm.0 = internal unnamed_addr global i32 0, align 4, !dbg !0 +; IMPORT-NEXT: @gBar = internal local_unnamed_addr global i32 0, align 4, !dbg !5 ; IMPORT: !DICompileUnit({{.*}}) ; STATS: 2 module-summary-index - Number of live global variables marked write only diff --git a/llvm/test/ThinLTO/X86/writeonly2.ll b/llvm/test/ThinLTO/X86/writeonly2.ll index a7383f25b4822..2648727f09971 100644 --- a/llvm/test/ThinLTO/X86/writeonly2.ll +++ b/llvm/test/ThinLTO/X86/writeonly2.ll @@ -19,8 +19,8 @@ ; with corresponding stores ; RUN: llvm-dis %t3.2.5.precodegen.bc -o - | FileCheck %s --check-prefix=CODEGEN-SRC -; IMPORT: @gFoo.llvm.0 = internal unnamed_addr global i32 1, align 4 -; IMPORT-NEXT: @gBar = internal local_unnamed_addr global i32 2, align 4 +; IMPORT: @gFoo.llvm.0 = internal unnamed_addr global i32 0, align 4 +; IMPORT-NEXT: @gBar = internal local_unnamed_addr global i32 0, align 4 ; IMPORT: !DICompileUnit({{.*}}) ; CODEGEN-NOT: gFoo diff --git a/llvm/test/Transforms/FunctionAttrs/align.ll b/llvm/test/Transforms/Attributor/align.ll similarity index 85% rename from llvm/test/Transforms/FunctionAttrs/align.ll rename to llvm/test/Transforms/Attributor/align.ll index da7bd1b5cc9a8..a5bf91915baf8 100644 --- a/llvm/test/Transforms/FunctionAttrs/align.ll +++ b/llvm/test/Transforms/Attributor/align.ll @@ -337,5 +337,64 @@ define i64 
@test11(i32* %p) { %ret = load i64, i64* %p-cast, align 8 ret i64 %ret } + +; TEST 12 +; Test for deduction using must-be-executed-context and GEP instruction + +; FIXME: %p should have nonnull +; ATTRIBUTOR: define i64 @test12-1(i32* nocapture nofree readonly align 16 %p) +define i64 @test12-1(i32* align 4 %p) { + %p-cast = bitcast i32* %p to i64* + %arrayidx0 = getelementptr i64, i64* %p-cast, i64 1 + %arrayidx1 = getelementptr i64, i64* %arrayidx0, i64 3 + %ret = load i64, i64* %arrayidx1, align 16 + ret i64 %ret +} + +; ATTRIBUTOR: define i64 @test12-2(i32* nocapture nofree nonnull readonly align 16 dereferenceable(8) %p) +define i64 @test12-2(i32* align 4 %p) { + %p-cast = bitcast i32* %p to i64* + %arrayidx0 = getelementptr i64, i64* %p-cast, i64 0 + %ret = load i64, i64* %arrayidx0, align 16 + ret i64 %ret +} + +; FIXME: %p should have nonnull +; ATTRIBUTOR: define void @test12-3(i32* nocapture nofree writeonly align 16 %p) +define void @test12-3(i32* align 4 %p) { + %p-cast = bitcast i32* %p to i64* + %arrayidx0 = getelementptr i64, i64* %p-cast, i64 1 + %arrayidx1 = getelementptr i64, i64* %arrayidx0, i64 3 + store i64 0, i64* %arrayidx1, align 16 + ret void +} + +; ATTRIBUTOR: define void @test12-4(i32* nocapture nofree nonnull writeonly align 16 dereferenceable(8) %p) +define void @test12-4(i32* align 4 %p) { + %p-cast = bitcast i32* %p to i64* + %arrayidx0 = getelementptr i64, i64* %p-cast, i64 0 + store i64 0, i64* %arrayidx0, align 16 + ret void +} + +declare void @use(i64*) willreturn nounwind + +; ATTRIBUTOR: define void @test12-5(i32* align 16 %p) +define void @test12-5(i32* align 4 %p) { + %p-cast = bitcast i32* %p to i64* + %arrayidx0 = getelementptr i64, i64* %p-cast, i64 1 + %arrayidx1 = getelementptr i64, i64* %arrayidx0, i64 3 + tail call void @use(i64* align 16 %arrayidx1) + ret void +} + +; ATTRIBUTOR: define void @test12-6(i32* align 16 %p) +define void @test12-6(i32* align 4 %p) { + %p-cast = bitcast i32* %p to i64* + %arrayidx0 = getelementptr i64, i64* %p-cast, i64 0 + tail call void @use(i64* align 16 %arrayidx0) + ret void +} + attributes #0 = { nounwind uwtable noinline } attributes #1 = { uwtable noinline } diff --git a/llvm/test/Transforms/FunctionAttrs/callbacks.ll b/llvm/test/Transforms/Attributor/callbacks.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/callbacks.ll rename to llvm/test/Transforms/Attributor/callbacks.ll diff --git a/llvm/test/Transforms/FunctionAttrs/dereferenceable.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll similarity index 96% rename from llvm/test/Transforms/FunctionAttrs/dereferenceable.ll rename to llvm/test/Transforms/Attributor/dereferenceable-1.ll index 1c285fa288370..951b5047747f0 100644 --- a/llvm/test/Transforms/FunctionAttrs/dereferenceable.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll @@ -30,8 +30,7 @@ define i32* @test3_1(i32* dereferenceable(8) %0) local_unnamed_addr { } define i32* @test3_2(i32* dereferenceable_or_null(32) %0) local_unnamed_addr { -; FIXME: We should not have both deref(x) and deref_or_null(y) with x >= y. 
-; ATTRIBUTOR: define nonnull dereferenceable(16) i32* @test3_2(i32* nofree nonnull readnone dereferenceable(32) dereferenceable_or_null(32) "no-capture-maybe-returned" %0) +; ATTRIBUTOR: define nonnull dereferenceable(16) i32* @test3_2(i32* nofree nonnull readnone dereferenceable(32) "no-capture-maybe-returned" %0) %ret = getelementptr inbounds i32, i32* %0, i64 4 ret i32* %ret } @@ -202,3 +201,9 @@ define i32* @test_for_minus_index(i32* %p) { store i32 1, i32* %q ret i32* %q } + +define void @deref_or_null_and_nonnull(i32* dereferenceable_or_null(100) %0) { +; ATTRIBUTOR: define void @deref_or_null_and_nonnull(i32* nocapture nofree nonnull writeonly dereferenceable(100) %0) + store i32 1, i32* %0 + ret void +} diff --git a/llvm/test/Transforms/Attributor/dereferenceable-2.ll b/llvm/test/Transforms/Attributor/dereferenceable-2.ll new file mode 100644 index 0000000000000..b3c0440f930f4 --- /dev/null +++ b/llvm/test/Transforms/Attributor/dereferenceable-2.ll @@ -0,0 +1,356 @@ +; RUN: opt < %s -attributor --attributor-disable=false -S | FileCheck %s --check-prefix=ATTRIBUTOR +; Copied from Transforms/InferFunctionAttrs/dereferenceable.ll + +; Determine dereference-ability before unused loads get deleted: +; https://bugs.llvm.org/show_bug.cgi?id=21780 + +define <4 x double> @PR21780(double* %ptr) { +; ATTRIBUTOR-LABEL: @PR21780(double* nocapture nofree nonnull readonly align 8 dereferenceable(32) %ptr) + + ; GEP of index 0 is simplified away. + %arrayidx1 = getelementptr inbounds double, double* %ptr, i64 1 + %arrayidx2 = getelementptr inbounds double, double* %ptr, i64 2 + %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3 + + %t0 = load double, double* %ptr, align 8 + %t1 = load double, double* %arrayidx1, align 8 + %t2 = load double, double* %arrayidx2, align 8 + %t3 = load double, double* %arrayidx3, align 8 + + %vecinit0 = insertelement <4 x double> undef, double %t0, i32 0 + %vecinit1 = insertelement <4 x double> %vecinit0, double %t1, i32 1 + %vecinit2 = insertelement <4 x double> %vecinit1, double %t2, i32 2 + %vecinit3 = insertelement <4 x double> %vecinit2, double %t3, i32 3 + %shuffle = shufflevector <4 x double> %vecinit3, <4 x double> %vecinit3, <4 x i32> + ret <4 x double> %shuffle +} + + +define double @PR21780_only_access3_with_inbounds(double* %ptr) { +; ATTRIBUTOR-LABEL: @PR21780_only_access3_with_inbounds(double* nocapture nofree nonnull readonly align 8 dereferenceable(32) %ptr) + + %arrayidx3 = getelementptr inbounds double, double* %ptr, i64 3 + %t3 = load double, double* %arrayidx3, align 8 + ret double %t3 +} + +define double @PR21780_only_access3_without_inbounds(double* %ptr) { +; ATTRIBUTOR-LABEL: @PR21780_only_access3_without_inbounds(double* nocapture nofree readonly align 8 %ptr) + %arrayidx3 = getelementptr double, double* %ptr, i64 3 + %t3 = load double, double* %arrayidx3, align 8 + ret double %t3 +} + +define double @PR21780_without_inbounds(double* %ptr) { +; ATTRIBUTOR-LABEL: @PR21780_without_inbounds(double* nocapture nofree nonnull readonly align 8 dereferenceable(32) %ptr) + + %arrayidx1 = getelementptr double, double* %ptr, i64 1 + %arrayidx2 = getelementptr double, double* %ptr, i64 2 + %arrayidx3 = getelementptr double, double* %ptr, i64 3 + + %t0 = load double, double* %ptr, align 8 + %t1 = load double, double* %arrayidx1, align 8 + %t2 = load double, double* %arrayidx2, align 8 + %t3 = load double, double* %arrayidx3, align 8 + + ret double %t3 +} + +; Unsimplified, but still valid. Also, throw in some bogus arguments. 
+ +define void @gep0(i8* %unused, i8* %other, i8* %ptr) { +; ATTRIBUTOR-LABEL: @gep0(i8* nocapture nofree readnone %unused, i8* nocapture nofree nonnull writeonly dereferenceable(1) %other, i8* nocapture nofree nonnull readonly dereferenceable(3) %ptr) + %arrayidx0 = getelementptr i8, i8* %ptr, i64 0 + %arrayidx1 = getelementptr i8, i8* %ptr, i64 1 + %arrayidx2 = getelementptr i8, i8* %ptr, i64 2 + %t0 = load i8, i8* %arrayidx0 + %t1 = load i8, i8* %arrayidx1 + %t2 = load i8, i8* %arrayidx2 + store i8 %t2, i8* %other + ret void +} + +; Order of accesses does not change computation. +; Multiple arguments may be dereferenceable. + +define void @ordering(i8* %ptr1, i32* %ptr2) { +; ATTRIBUTOR-LABEL: @ordering(i8* nocapture nofree nonnull readonly dereferenceable(3) %ptr1, i32* nocapture nofree nonnull readonly dereferenceable(8) %ptr2) + %a20 = getelementptr i32, i32* %ptr2, i64 0 + %a12 = getelementptr i8, i8* %ptr1, i64 2 + %t12 = load i8, i8* %a12 + %a11 = getelementptr i8, i8* %ptr1, i64 1 + %t20 = load i32, i32* %a20 + %a10 = getelementptr i8, i8* %ptr1, i64 0 + %t10 = load i8, i8* %a10 + %t11 = load i8, i8* %a11 + %a21 = getelementptr i32, i32* %ptr2, i64 1 + %t21 = load i32, i32* %a21 + ret void +} + +; Not in entry block. + +define void @not_entry_but_guaranteed_to_execute(i8* %ptr) { +; ATTRIBUTOR-LABEL: @not_entry_but_guaranteed_to_execute(i8* nocapture nofree nonnull readonly dereferenceable(3) %ptr) +entry: + br label %exit +exit: + %arrayidx0 = getelementptr i8, i8* %ptr, i64 0 + %arrayidx1 = getelementptr i8, i8* %ptr, i64 1 + %arrayidx2 = getelementptr i8, i8* %ptr, i64 2 + %t0 = load i8, i8* %arrayidx0 + %t1 = load i8, i8* %arrayidx1 + %t2 = load i8, i8* %arrayidx2 + ret void +} + +; Not in entry block and not guaranteed to execute. + +define void @not_entry_not_guaranteed_to_execute(i8* %ptr, i1 %cond) { +; ATTRIBUTOR-LABEL: @not_entry_not_guaranteed_to_execute(i8* nocapture nofree readonly %ptr, i1 %cond) +entry: + br i1 %cond, label %loads, label %exit +loads: + %arrayidx0 = getelementptr i8, i8* %ptr, i64 0 + %arrayidx1 = getelementptr i8, i8* %ptr, i64 1 + %arrayidx2 = getelementptr i8, i8* %ptr, i64 2 + %t0 = load i8, i8* %arrayidx0 + %t1 = load i8, i8* %arrayidx1 + %t2 = load i8, i8* %arrayidx2 + ret void +exit: + ret void +} + +; The last load may not execute, so dereferenceable bytes only cover the first two loads. + +define void @partial_in_entry(i16* %ptr, i1 %cond) { +; ATTRIBUTOR-LABEL: @partial_in_entry(i16* nocapture nofree nonnull readonly dereferenceable(4) %ptr, i1 %cond) +entry: + %arrayidx0 = getelementptr i16, i16* %ptr, i64 0 + %arrayidx1 = getelementptr i16, i16* %ptr, i64 1 + %arrayidx2 = getelementptr i16, i16* %ptr, i64 2 + %t0 = load i16, i16* %arrayidx0 + %t1 = load i16, i16* %arrayidx1 + br i1 %cond, label %loads, label %exit +loads: + %t2 = load i16, i16* %arrayidx2 + ret void +exit: + ret void +} + +; The volatile load can't be used to prove a non-volatile access is allowed. +; The 2nd and 3rd loads may never execute. + +define void @volatile_is_not_dereferenceable(i16* %ptr) { +; ATTRIBUTOR-LABEL: @volatile_is_not_dereferenceable(i16* nofree %ptr) + %arrayidx0 = getelementptr i16, i16* %ptr, i64 0 + %arrayidx1 = getelementptr i16, i16* %ptr, i64 1 + %arrayidx2 = getelementptr i16, i16* %ptr, i64 2 + %t0 = load volatile i16, i16* %arrayidx0 + %t1 = load i16, i16* %arrayidx1 + %t2 = load i16, i16* %arrayidx2 + ret void +} + +; TODO: We should allow inference for atomic (but not volatile) ops. 
+ +define void @atomic_is_alright(i16* %ptr) { +; ATTRIBUTOR-LABEL: @atomic_is_alright(i16* nocapture nofree nonnull readonly align 2 dereferenceable(6) %ptr) + %arrayidx0 = getelementptr i16, i16* %ptr, i64 0 + %arrayidx1 = getelementptr i16, i16* %ptr, i64 1 + %arrayidx2 = getelementptr i16, i16* %ptr, i64 2 + %t0 = load atomic i16, i16* %arrayidx0 unordered, align 2 + %t1 = load i16, i16* %arrayidx1 + %t2 = load i16, i16* %arrayidx2 + ret void +} + +declare void @may_not_return() + +define void @not_guaranteed_to_transfer_execution(i16* %ptr) { +; ATTRIBUTOR-LABEL: @not_guaranteed_to_transfer_execution(i16* nocapture nonnull readonly dereferenceable(2) %ptr) + %arrayidx0 = getelementptr i16, i16* %ptr, i64 0 + %arrayidx1 = getelementptr i16, i16* %ptr, i64 1 + %arrayidx2 = getelementptr i16, i16* %ptr, i64 2 + %t0 = load i16, i16* %arrayidx0 + call void @may_not_return() + %t1 = load i16, i16* %arrayidx1 + %t2 = load i16, i16* %arrayidx2 + ret void +} + +; We must have consecutive accesses. + +define void @variable_gep_index(i8* %unused, i8* %ptr, i64 %variable_index) { +; ATTRIBUTOR-LABEL: @variable_gep_index(i8* nocapture nofree readnone %unused, i8* nocapture nofree nonnull readonly dereferenceable(1) %ptr, i64 %variable_index) + %arrayidx1 = getelementptr i8, i8* %ptr, i64 %variable_index + %arrayidx2 = getelementptr i8, i8* %ptr, i64 2 + %t0 = load i8, i8* %ptr + %t1 = load i8, i8* %arrayidx1 + %t2 = load i8, i8* %arrayidx2 + ret void +} + +; Deal with >1 GEP index. + +define void @multi_index_gep(<4 x i8>* %ptr) { +; FIXME: %ptr should be dereferenceable(4) +; ATTRIBUTOR-LABEL: @multi_index_gep(<4 x i8>* nocapture nofree nonnull readonly dereferenceable(1) %ptr) + %arrayidx00 = getelementptr <4 x i8>, <4 x i8>* %ptr, i64 0, i64 0 + %t0 = load i8, i8* %arrayidx00 + ret void +} + +; Could round weird bitwidths down? + +define void @not_byte_multiple(i9* %ptr) { +; ATTRIBUTOR-LABEL: @not_byte_multiple(i9* nocapture nofree nonnull readonly dereferenceable(2) %ptr) + %arrayidx0 = getelementptr i9, i9* %ptr, i64 0 + %t0 = load i9, i9* %arrayidx0 + ret void +} + +; Missing direct access from the pointer. + +define void @no_pointer_deref(i16* %ptr) { +; ATTRIBUTOR-LABEL: @no_pointer_deref(i16* nocapture nofree readonly %ptr) + %arrayidx1 = getelementptr i16, i16* %ptr, i64 1 + %arrayidx2 = getelementptr i16, i16* %ptr, i64 2 + %t1 = load i16, i16* %arrayidx1 + %t2 = load i16, i16* %arrayidx2 + ret void +} + +; Out-of-order is OK, but a missing access ends the dereferenceable range. + +define void @non_consecutive(i32* %ptr) { +; ATTRIBUTOR-LABEL: @non_consecutive(i32* nocapture nofree nonnull readonly dereferenceable(8) %ptr) + %arrayidx1 = getelementptr i32, i32* %ptr, i64 1 + %arrayidx0 = getelementptr i32, i32* %ptr, i64 0 + %arrayidx3 = getelementptr i32, i32* %ptr, i64 3 + %t1 = load i32, i32* %arrayidx1 + %t0 = load i32, i32* %arrayidx0 + %t3 = load i32, i32* %arrayidx3 + ret void +} + +; Improve on existing dereferenceable attribute. 
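+; (Exposition added during editing, not part of the original test: @more_bytes below is only annotated +; dereferenceable(8), but the four consecutive i32 loads at byte offsets 0-12 prove 16 accessible bytes, +; so the deduced attribute improves to dereferenceable(16), as its check line states.)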
+ +define void @more_bytes(i32* dereferenceable(8) %ptr) { +; ATTRIBUTOR-LABEL: @more_bytes(i32* nocapture nofree nonnull readonly dereferenceable(16) %ptr) + %arrayidx3 = getelementptr i32, i32* %ptr, i64 3 + %arrayidx1 = getelementptr i32, i32* %ptr, i64 1 + %arrayidx0 = getelementptr i32, i32* %ptr, i64 0 + %arrayidx2 = getelementptr i32, i32* %ptr, i64 2 + %t3 = load i32, i32* %arrayidx3 + %t1 = load i32, i32* %arrayidx1 + %t2 = load i32, i32* %arrayidx2 + %t0 = load i32, i32* %arrayidx0 + ret void +} + +; Improve on existing dereferenceable_or_null attribute. + +define void @more_bytes_and_not_null(i32* dereferenceable_or_null(8) %ptr) { +; ATTRIBUTOR-LABEL: @more_bytes_and_not_null(i32* nocapture nofree nonnull readonly dereferenceable(16) %ptr) + %arrayidx3 = getelementptr i32, i32* %ptr, i64 3 + %arrayidx1 = getelementptr i32, i32* %ptr, i64 1 + %arrayidx0 = getelementptr i32, i32* %ptr, i64 0 + %arrayidx2 = getelementptr i32, i32* %ptr, i64 2 + %t3 = load i32, i32* %arrayidx3 + %t1 = load i32, i32* %arrayidx1 + %t2 = load i32, i32* %arrayidx2 + %t0 = load i32, i32* %arrayidx0 + ret void +} + +; But don't pessimize existing dereferenceable attribute. + +define void @better_bytes(i32* dereferenceable(100) %ptr) { +; ATTRIBUTOR-LABEL: @better_bytes(i32* nocapture nofree nonnull readonly dereferenceable(100) %ptr) + %arrayidx3 = getelementptr i32, i32* %ptr, i64 3 + %arrayidx1 = getelementptr i32, i32* %ptr, i64 1 + %arrayidx0 = getelementptr i32, i32* %ptr, i64 0 + %arrayidx2 = getelementptr i32, i32* %ptr, i64 2 + %t3 = load i32, i32* %arrayidx3 + %t1 = load i32, i32* %arrayidx1 + %t2 = load i32, i32* %arrayidx2 + %t0 = load i32, i32* %arrayidx0 + ret void +} + +define void @bitcast(i32* %arg) { +; ATTRIBUTOR-LABEL: @bitcast(i32* nocapture nofree nonnull readonly dereferenceable(8) %arg) + %ptr = bitcast i32* %arg to float* + %arrayidx0 = getelementptr float, float* %ptr, i64 0 + %arrayidx1 = getelementptr float, float* %ptr, i64 1 + %t0 = load float, float* %arrayidx0 + %t1 = load float, float* %arrayidx1 + ret void +} + +define void @bitcast_different_sizes(double* %arg1, i8* %arg2) { +; ATTRIBUTOR-LABEL: @bitcast_different_sizes(double* nocapture nofree nonnull readonly dereferenceable(12) %arg1, i8* nocapture nofree nonnull readonly dereferenceable(16) %arg2) + %ptr1 = bitcast double* %arg1 to float* + %a10 = getelementptr float, float* %ptr1, i64 0 + %a11 = getelementptr float, float* %ptr1, i64 1 + %a12 = getelementptr float, float* %ptr1, i64 2 + %ld10 = load float, float* %a10 + %ld11 = load float, float* %a11 + %ld12 = load float, float* %a12 + + %ptr2 = bitcast i8* %arg2 to i64* + %a20 = getelementptr i64, i64* %ptr2, i64 0 + %a21 = getelementptr i64, i64* %ptr2, i64 1 + %ld20 = load i64, i64* %a20 + %ld21 = load i64, i64* %a21 + ret void +} + +define void @negative_offset(i32* %arg) { +; ATTRIBUTOR-LABEL: @negative_offset(i32* nocapture nofree nonnull readonly dereferenceable(4) %arg) + %ptr = bitcast i32* %arg to float* + %arrayidx0 = getelementptr float, float* %ptr, i64 0 + %arrayidx1 = getelementptr float, float* %ptr, i64 -1 + %t0 = load float, float* %arrayidx0 + %t1 = load float, float* %arrayidx1 + ret void +} + +define void @stores(i32* %arg) { +; ATTRIBUTOR-LABEL: @stores(i32* nocapture nofree nonnull writeonly dereferenceable(8) %arg) + %ptr = bitcast i32* %arg to float* + %arrayidx0 = getelementptr float, float* %ptr, i64 0 + %arrayidx1 = getelementptr float, float* %ptr, i64 1 + store float 1.0, float* %arrayidx0 + store float 2.0, float* %arrayidx1 + ret 
void +} + +define void @load_store(i32* %arg) { +; ATTRIBUTOR-LABEL: @load_store(i32* nocapture nofree nonnull dereferenceable(8) %arg) + %ptr = bitcast i32* %arg to float* + %arrayidx0 = getelementptr float, float* %ptr, i64 0 + %arrayidx1 = getelementptr float, float* %ptr, i64 1 + %t1 = load float, float* %arrayidx0 + store float 2.0, float* %arrayidx1 + ret void +} + +define void @different_size1(i32* %arg) { +; ATTRIBUTOR-LABEL: @different_size1(i32* nocapture nofree nonnull writeonly dereferenceable(8) %arg) + %arg-cast = bitcast i32* %arg to double* + store double 0.000000e+00, double* %arg-cast + store i32 0, i32* %arg + ret void +} + +define void @different_size2(i32* %arg) { +; ATTRIBUTOR-LABEL: @different_size2(i32* nocapture nofree nonnull writeonly dereferenceable(8) %arg) + store i32 0, i32* %arg + %arg-cast = bitcast i32* %arg to double* + store double 0.000000e+00, double* %arg-cast + ret void +} diff --git a/llvm/test/Transforms/FunctionAttrs/heap_to_stack.ll b/llvm/test/Transforms/Attributor/heap_to_stack.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/heap_to_stack.ll rename to llvm/test/Transforms/Attributor/heap_to_stack.ll diff --git a/llvm/test/Transforms/FunctionAttrs/internal-noalias.ll b/llvm/test/Transforms/Attributor/internal-noalias.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/internal-noalias.ll rename to llvm/test/Transforms/Attributor/internal-noalias.ll diff --git a/llvm/test/Transforms/FunctionAttrs/liveness.ll b/llvm/test/Transforms/Attributor/liveness.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/liveness.ll rename to llvm/test/Transforms/Attributor/liveness.ll diff --git a/llvm/test/Transforms/FunctionAttrs/misc.ll b/llvm/test/Transforms/Attributor/misc.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/misc.ll rename to llvm/test/Transforms/Attributor/misc.ll diff --git a/llvm/test/Transforms/FunctionAttrs/new_attributes.ll b/llvm/test/Transforms/Attributor/new_attributes.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/new_attributes.ll rename to llvm/test/Transforms/Attributor/new_attributes.ll diff --git a/llvm/test/Transforms/FunctionAttrs/noalias_returned.ll b/llvm/test/Transforms/Attributor/noalias.ll similarity index 100% rename from llvm/test/Transforms/FunctionAttrs/noalias_returned.ll rename to llvm/test/Transforms/Attributor/noalias.ll diff --git a/llvm/test/Transforms/Attributor/nocapture-1.ll b/llvm/test/Transforms/Attributor/nocapture-1.ll new file mode 100644 index 0000000000000..abb148d883ed4 --- /dev/null +++ b/llvm/test/Transforms/Attributor/nocapture-1.ll @@ -0,0 +1,346 @@ +; RUN: opt -attributor -attributor-manifest-internal -attributor-disable=false -S -attributor-annotate-decl-cs < %s | FileCheck %s --check-prefixes=ATTRIBUTOR +; RUN: opt -passes=attributor -attributor-manifest-internal -attributor-disable=false -S -attributor-annotate-decl-cs < %s | FileCheck %s --check-prefixes=ATTRIBUTOR +; Copied from Transforms/FunctionAttrs/nocapture.ll + +@g = global i32* null ; [#uses=1] + +; ATTRIBUTOR: define i32* @c1(i32* nofree readnone returned "no-capture-maybe-returned" %q) +define i32* @c1(i32* %q) { + ret i32* %q +} + +; ATTRIBUTOR: define void @c2(i32* nofree writeonly %q) +; It would also be acceptable to mark %q as readnone. Update @c3 too. 
+define void @c2(i32* %q) { + store i32* %q, i32** @g + ret void +} + +; ATTRIBUTOR: define void @c3(i32* nofree writeonly %q) +define void @c3(i32* %q) { + call void @c2(i32* %q) + ret void +} + +; ATTRIBUTOR: define i1 @c4(i32* nofree readnone %q, i32 %bitno) +define i1 @c4(i32* %q, i32 %bitno) { + %tmp = ptrtoint i32* %q to i32 + %tmp2 = lshr i32 %tmp, %bitno + %bit = trunc i32 %tmp2 to i1 + br i1 %bit, label %l1, label %l0 +l0: + ret i1 0 ; escaping value not caught by def-use chaining. +l1: + ret i1 1 ; escaping value not caught by def-use chaining. +} + +; c4b is c4 but without the escaping part +; ATTRIBUTOR: define i1 @c4b(i32* nocapture nofree readnone %q, i32 %bitno) +define i1 @c4b(i32* %q, i32 %bitno) { + %tmp = ptrtoint i32* %q to i32 + %tmp2 = lshr i32 %tmp, %bitno + %bit = trunc i32 %tmp2 to i1 + br i1 %bit, label %l1, label %l0 +l0: + ret i1 0 ; not escaping! +l1: + ret i1 0 ; not escaping! +} + +@lookup_table = global [2 x i1] [ i1 0, i1 1 ] + +; ATTRIBUTOR: define i1 @c5(i32* nofree readonly %q, i32 %bitno) +define i1 @c5(i32* %q, i32 %bitno) { + %tmp = ptrtoint i32* %q to i32 + %tmp2 = lshr i32 %tmp, %bitno + %bit = and i32 %tmp2, 1 + ; subtle escape mechanism follows + %lookup = getelementptr [2 x i1], [2 x i1]* @lookup_table, i32 0, i32 %bit + %val = load i1, i1* %lookup + ret i1 %val +} + +declare void @throw_if_bit_set(i8*, i8) readonly + +; ATTRIBUTOR: define i1 @c6(i8* readonly %q, i8 %bit) +define i1 @c6(i8* %q, i8 %bit) personality i32 (...)* @__gxx_personality_v0 { + invoke void @throw_if_bit_set(i8* %q, i8 %bit) + to label %ret0 unwind label %ret1 +ret0: + ret i1 0 +ret1: + %exn = landingpad {i8*, i32} + cleanup + ret i1 1 +} + +declare i32 @__gxx_personality_v0(...) + +define i1* @lookup_bit(i32* %q, i32 %bitno) readnone nounwind { + %tmp = ptrtoint i32* %q to i32 + %tmp2 = lshr i32 %tmp, %bitno + %bit = and i32 %tmp2, 1 + %lookup = getelementptr [2 x i1], [2 x i1]* @lookup_table, i32 0, i32 %bit + ret i1* %lookup +} + +; ATTRIBUTOR: define i1 @c7(i32* nofree readonly %q, i32 %bitno) +define i1 @c7(i32* %q, i32 %bitno) { + %ptr = call i1* @lookup_bit(i32* %q, i32 %bitno) + %val = load i1, i1* %ptr + ret i1 %val +} + + +; ATTRIBUTOR: define i32 @nc1(i32* nofree %q, i32* nocapture nofree %p, i1 %b) +define i32 @nc1(i32* %q, i32* %p, i1 %b) { +e: + br label %l +l: + %x = phi i32* [ %p, %e ] + %y = phi i32* [ %q, %e ] + %tmp = bitcast i32* %x to i32* ; [#uses=2] + %tmp2 = select i1 %b, i32* %tmp, i32* %y + %val = load i32, i32* %tmp2 ; [#uses=1] + store i32 0, i32* %tmp + store i32* %y, i32** @g + ret i32 %val +} + +; ATTRIBUTOR: define i32 @nc1_addrspace(i32* nofree %q, i32 addrspace(1)* nocapture nofree %p, i1 %b) +define i32 @nc1_addrspace(i32* %q, i32 addrspace(1)* %p, i1 %b) { +e: + br label %l +l: + %x = phi i32 addrspace(1)* [ %p, %e ] + %y = phi i32* [ %q, %e ] + %tmp = addrspacecast i32 addrspace(1)* %x to i32* ; [#uses=2] + %tmp2 = select i1 %b, i32* %tmp, i32* %y + %val = load i32, i32* %tmp2 ; [#uses=1] + store i32 0, i32* %tmp + store i32* %y, i32** @g + ret i32 %val +} + +; ATTRIBUTOR: define void @nc2(i32* nocapture nofree %p, i32* nofree %q) +define void @nc2(i32* %p, i32* %q) { + %1 = call i32 @nc1(i32* %q, i32* %p, i1 0) ; [#uses=0] + ret void +} + + +; ATTRIBUTOR: define void @nc3(void ()* nocapture nofree nonnull %p) +define void @nc3(void ()* %p) { + call void %p() + ret void +} + +declare void @external(i8*) readonly nounwind +; ATTRIBUTOR: define void @nc4(i8* nocapture readonly %p) +define void @nc4(i8* %p) { + call void @external(i8* %p) + 
ret void +} + +; ATTRIBUTOR: define void @nc5(void (i8*)* nocapture nofree nonnull %f, i8* nocapture %p) +define void @nc5(void (i8*)* %f, i8* %p) { + call void %f(i8* %p) readonly nounwind + call void %f(i8* nocapture %p) + ret void +} + +; ATTRIBUTOR: define void @test1_1(i8* nocapture nofree readnone %x1_1, i8* nocapture nofree readnone %y1_1, i1 %c) +; It would be acceptable to add readnone to %y1_1 and %y1_2. +define void @test1_1(i8* %x1_1, i8* %y1_1, i1 %c) { + call i8* @test1_2(i8* %x1_1, i8* %y1_1, i1 %c) + store i32* null, i32** @g + ret void +} + +; ATTRIBUTOR: define i8* @test1_2(i8* nocapture nofree readnone %x1_2, i8* nofree readnone returned "no-capture-maybe-returned" %y1_2, i1 %c) +define i8* @test1_2(i8* %x1_2, i8* %y1_2, i1 %c) { + br i1 %c, label %t, label %f +t: + call void @test1_1(i8* %x1_2, i8* %y1_2, i1 %c) + store i32* null, i32** @g + br label %f +f: + ret i8* %y1_2 +} + +; ATTRIBUTOR: define void @test2(i8* nocapture nofree readnone %x2) +define void @test2(i8* %x2) { + call void @test2(i8* %x2) + store i32* null, i32** @g + ret void +} + +; ATTRIBUTOR: define void @test3(i8* nocapture nofree readnone %x3, i8* nocapture nofree readnone %y3, i8* nocapture nofree readnone %z3) +define void @test3(i8* %x3, i8* %y3, i8* %z3) { + call void @test3(i8* %z3, i8* %y3, i8* %x3) + store i32* null, i32** @g + ret void +} + +; ATTRIBUTOR: define void @test4_1(i8* nocapture nofree readnone %x4_1, i1 %c) +define void @test4_1(i8* %x4_1, i1 %c) { + call i8* @test4_2(i8* %x4_1, i8* %x4_1, i8* %x4_1, i1 %c) + store i32* null, i32** @g + ret void +} + +; ATTRIBUTOR: define i8* @test4_2(i8* nocapture nofree readnone %x4_2, i8* nofree readnone returned "no-capture-maybe-returned" %y4_2, i8* nocapture nofree readnone %z4_2, i1 %c) +define i8* @test4_2(i8* %x4_2, i8* %y4_2, i8* %z4_2, i1 %c) { + br i1 %c, label %t, label %f +t: + call void @test4_1(i8* null, i1 %c) + store i32* null, i32** @g + br label %f +f: + ret i8* %y4_2 +} + +declare i8* @test5_1(i8* %x5_1) + +; ATTRIBUTOR: define void @test5_2(i8* %x5_2) +define void @test5_2(i8* %x5_2) { + call i8* @test5_1(i8* %x5_2) + store i32* null, i32** @g + ret void +} + +declare void @test6_1(i8* %x6_1, i8* nocapture %y6_1, ...) + +; ATTRIBUTOR: define void @test6_2(i8* %x6_2, i8* nocapture %y6_2, i8* %z6_2) +define void @test6_2(i8* %x6_2, i8* %y6_2, i8* %z6_2) { + call void (i8*, i8*, ...) 
@test6_1(i8* %x6_2, i8* %y6_2, i8* %z6_2) + store i32* null, i32** @g + ret void +} + +; ATTRIBUTOR: define void @test_cmpxchg(i32* nocapture nofree nonnull dereferenceable(4) %p) +define void @test_cmpxchg(i32* %p) { + cmpxchg i32* %p, i32 0, i32 1 acquire monotonic + ret void +} + +; ATTRIBUTOR: define void @test_cmpxchg_ptr(i32** nocapture nofree nonnull dereferenceable(8) %p, i32* nofree %q) +define void @test_cmpxchg_ptr(i32** %p, i32* %q) { + cmpxchg i32** %p, i32* null, i32* %q acquire monotonic + ret void +} + +; ATTRIBUTOR: define void @test_atomicrmw(i32* nocapture nofree nonnull dereferenceable(4) %p) +define void @test_atomicrmw(i32* %p) { + atomicrmw add i32* %p, i32 1 seq_cst + ret void +} + +; ATTRIBUTOR: define void @test_volatile(i32* nofree align 4 %x) +define void @test_volatile(i32* %x) { +entry: + %gep = getelementptr i32, i32* %x, i64 1 + store volatile i32 0, i32* %gep, align 4 + ret void +} + +; ATTRIBUTOR: nocaptureLaunder(i8* nocapture %p) +define void @nocaptureLaunder(i8* %p) { +entry: + %b = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) + store i8 42, i8* %b + ret void +} + +@g2 = global i8* null +; ATTRIBUTOR: define void @captureLaunder(i8* %p) +define void @captureLaunder(i8* %p) { + %b = call i8* @llvm.launder.invariant.group.p0i8(i8* %p) + store i8* %b, i8** @g2 + ret void +} + +; ATTRIBUTOR: @nocaptureStrip(i8* nocapture writeonly %p) +define void @nocaptureStrip(i8* %p) { +entry: + %b = call i8* @llvm.strip.invariant.group.p0i8(i8* %p) + store i8 42, i8* %b + ret void +} + +@g3 = global i8* null +; ATTRIBUTOR: define void @captureStrip(i8* writeonly %p) +define void @captureStrip(i8* %p) { + %b = call i8* @llvm.strip.invariant.group.p0i8(i8* %p) + store i8* %b, i8** @g3 + ret void +} + +; ATTRIBUTOR: define i1 @captureICmp(i32* nofree readnone %x) +define i1 @captureICmp(i32* %x) { + %1 = icmp eq i32* %x, null + ret i1 %1 +} + +; ATTRIBUTOR: define i1 @captureICmpRev(i32* nofree readnone %x) +define i1 @captureICmpRev(i32* %x) { + %1 = icmp eq i32* null, %x + ret i1 %1 +} + +; ATTRIBUTOR: define i1 @nocaptureInboundsGEPICmp(i32* nocapture nofree nonnull readnone %x) +define i1 @nocaptureInboundsGEPICmp(i32* %x) { + %1 = getelementptr inbounds i32, i32* %x, i32 5 + %2 = bitcast i32* %1 to i8* + %3 = icmp eq i8* %2, null + ret i1 %3 +} + +; ATTRIBUTOR: define i1 @nocaptureInboundsGEPICmpRev(i32* nocapture nofree nonnull readnone %x) +define i1 @nocaptureInboundsGEPICmpRev(i32* %x) { + %1 = getelementptr inbounds i32, i32* %x, i32 5 + %2 = bitcast i32* %1 to i8* + %3 = icmp eq i8* null, %2 + ret i1 %3 +} + +; ATTRIBUTOR: define i1 @nocaptureDereferenceableOrNullICmp(i32* nocapture nofree readnone dereferenceable_or_null(4) %x) +define i1 @nocaptureDereferenceableOrNullICmp(i32* dereferenceable_or_null(4) %x) { + %1 = bitcast i32* %x to i8* + %2 = icmp eq i8* %1, null + ret i1 %2 +} + +; ATTRIBUTOR: define i1 @captureDereferenceableOrNullICmp(i32* nofree readnone dereferenceable_or_null(4) %x) +define i1 @captureDereferenceableOrNullICmp(i32* dereferenceable_or_null(4) %x) "null-pointer-is-valid"="true" { + %1 = bitcast i32* %x to i8* + %2 = icmp eq i8* %1, null + ret i1 %2 +} + +declare void @unknown(i8*) +define void @test_callsite() { +entry: +; We know that 'null' in AS 0 does not alias anything and cannot be captured. Though the latter is currently derived rather than queried. 
+; ATTRIBUTOR: call void @unknown(i8* noalias null) + call void @unknown(i8* null) + ret void +} + +declare i8* @unknownpi8pi8(i8*,i8* returned) +define i8* @test_returned1(i8* %A, i8* returned %B) nounwind readonly { +; ATTRIBUTOR: define i8* @test_returned1(i8* nocapture readonly %A, i8* readonly returned %B) +entry: + %p = call i8* @unknownpi8pi8(i8* %A, i8* %B) + ret i8* %p +} + +define i8* @test_returned2(i8* %A, i8* %B) { +; ATTRIBUTOR: define i8* @test_returned2(i8* nocapture readonly %A, i8* readonly returned %B) +entry: + %p = call i8* @unknownpi8pi8(i8* %A, i8* %B) nounwind readonly + ret i8* %p +} + +declare i8* @llvm.launder.invariant.group.p0i8(i8*) +declare i8* @llvm.strip.invariant.group.p0i8(i8*) diff --git a/llvm/test/Transforms/FunctionAttrs/arg_nocapture.ll b/llvm/test/Transforms/Attributor/nocapture-2.ll similarity index 99% rename from llvm/test/Transforms/FunctionAttrs/arg_nocapture.ll rename to llvm/test/Transforms/Attributor/nocapture-2.ll index fa4d984e931b0..79075268ed410 100644 --- a/llvm/test/Transforms/FunctionAttrs/arg_nocapture.ll +++ b/llvm/test/Transforms/Attributor/nocapture-2.ll @@ -260,8 +260,7 @@ entry: ; } ; ; There should *not* be a no-capture attribute on %a -; FIXME: %a should have align 8 -; CHECK: define nonnull align 8 dereferenceable(8) i64* @not_captured_but_returned_1(i64* nofree nonnull writeonly dereferenceable(16) "no-capture-maybe-returned" %a) +; CHECK: define nonnull align 8 dereferenceable(8) i64* @not_captured_but_returned_1(i64* nofree nonnull writeonly align 8 dereferenceable(16) "no-capture-maybe-returned" %a) define i64* @not_captured_but_returned_1(i64* %a) #0 { entry: %add.ptr = getelementptr inbounds i64, i64* %a, i64 1 diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll new file mode 100644 index 0000000000000..d06a0ea1e9b08 --- /dev/null +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -0,0 +1,243 @@ +; RUN: opt -attributor --attributor-disable=false -attributor-max-iterations-verify -attributor-annotate-decl-cs -attributor-max-iterations=2 -S < %s | FileCheck %s --check-prefix=ATTRIBUTOR +; Copied from Transforms/FunctionAttrs/nofree-attributor.ll + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; Test cases specifically designed for the "nofree" function attribute. +; We use FIXMEs to indicate problems and missing attributes. + +; Free functions +declare void @free(i8* nocapture) local_unnamed_addr #1 +declare noalias i8* @realloc(i8* nocapture, i64) local_unnamed_addr #0 +declare void @_ZdaPv(i8*) local_unnamed_addr #2 + + +; TEST 1 (positive case) +; ATTRIBUTOR: Function Attrs: nofree noinline nosync nounwind readnone uwtable +; ATTRIBUTOR-NEXT: define void @only_return() +define void @only_return() #0 { + ret void +} + + +; TEST 2 (negative case) +; Only free +; void only_free(char* p) { +; free(p); +; } + +; ATTRIBUTOR: Function Attrs: noinline nounwind uwtable +; ATTRIBUTOR-NOT: nofree +; ATTRIBUTOR-NEXT: define void @only_free(i8* nocapture %0) local_unnamed_addr #1 +define void @only_free(i8* nocapture %0) local_unnamed_addr #0 { + tail call void @free(i8* %0) #1 + ret void +} + + +; TEST 3 (negative case) +; Free occurs in the same SCC. 
+; void free_in_scc1(char*p){ +; free_in_scc2(p); +; } +; void free_in_scc2(char*p){ +; free_in_scc1(p); +; free(p); +; } + + +; ATTRIBUTOR: Function Attrs: noinline nounwind uwtable +; ATTRIBUTOR-NOT: nofree +; ATTRIBUTOR-NEXT: define void @free_in_scc1(i8* nocapture %0) local_unnamed_addr +define void @free_in_scc1(i8* nocapture %0) local_unnamed_addr #0 { + tail call void @free_in_scc2(i8* %0) #1 + ret void +} + + +; ATTRIBUTOR: Function Attrs: noinline nounwind uwtable +; ATTRIBUTOR-NOT: nofree +; ATTRIBUTOR: define void @free_in_scc2(i8* nocapture %0) local_unnamed_addr +define void @free_in_scc2(i8* nocapture %0) local_unnamed_addr #0 { + %cmp = icmp eq i8* %0, null + br i1 %cmp, label %rec, label %call +call: + tail call void @free(i8* %0) #1 + br label %end +rec: + tail call void @free_in_scc1(i8* %0) + br label %end +end: + ret void +} + + +; TEST 4 (positive case) +; Free doesn't occur. +; void mutual_recursion1(){ +; mutual_recursion2(); +; } +; void mutual_recursion2(){ +; mutual_recursion1(); +; } + + +; ATTRIBUTOR: Function Attrs: nofree noinline noreturn nosync nounwind readnone uwtable +; ATTRIBUTOR-NEXT: define void @mutual_recursion1() +define void @mutual_recursion1() #0 { + call void @mutual_recursion2() + ret void +} + +; ATTRIBUTOR: Function Attrs: nofree noinline noreturn nosync nounwind readnone uwtable +; ATTRIBUTOR-NEXT: define void @mutual_recursion2() +define void @mutual_recursion2() #0 { + call void @mutual_recursion1() + ret void +} + + +; TEST 5 +; C++ delete operation (negative case) +; void delete_op (char p[]){ +; delete [] p; +; } + +; ATTRIBUTOR: Function Attrs: noinline nounwind uwtable +; ATTRIBUTOR-NOT: nofree +; ATTRIBUTOR-NEXT: define void @_Z9delete_opPc(i8* %0) local_unnamed_addr #1 +define void @_Z9delete_opPc(i8* %0) local_unnamed_addr #0 { + %2 = icmp eq i8* %0, null + br i1 %2, label %4, label %3 + +;